<a href="https://colab.research.google.com/github/KapilM26/Insider-Threat-Detection/blob/main/one_svm2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """
    Collects the USB "Connect" events of one user from `device.csv`.

    :param user_id: User ID compared against the third column (index 2) of each row.
    :param dataset_path: Directory that contains `device.csv`.
    :return: List of raw CSV rows (lists of strings) whose user matches and
             whose sixth column (index 5) equals "Connect".
    """
    # The file name must be relative: an absolute component such as
    # "/device.csv" makes os.path.join discard `dataset_path` entirely.
    device_csv = os.path.join(dataset_path, "device.csv")
    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (tolerates an empty file)
        # Rows too short to hold the activity column are ignored instead of
        # raising IndexError.
        return [
            row
            for row in reader
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """
    Counts USB insertions per calendar week for one user.

    Weeks are labelled with "%Y-%W" (Monday-based week of the year; "00" covers
    the days before the year's first Monday). Every week label between the
    first and the last event is present in the result, with 0 where no event
    occurred.

    :param user: User ID written to the `user` column of every output row.
    :param usb_data: Rows whose second column (index 1) is a timestamp in
                     "%m/%d/%Y %H:%M:%S" format; each row counts as one insertion.
    :return: DataFrame with ['user', 'week', 'num_usb_insertions'], sorted by
             week; empty when `usb_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_usb_counts = defaultdict(int)
    first_day = last_day = None
    for row in usb_data:
        event_day = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S").date()
        weekly_usb_counts[event_day.strftime("%Y-%W")] += 1
        if first_day is None or event_day < first_day:
            first_day = event_day
        if last_day is None or event_day > last_day:
            last_day = event_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the events of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_usb_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_usb_insertions"])


def get_user_exe_data(user_id, dataset_path):
    """
    Collects the `.exe` file events of one user from `file.csv`.

    :param user_id: User ID compared against the third column (index 2) of each row.
    :param dataset_path: Directory that contains `file.csv`.
    :return: List of raw CSV rows (lists of strings) whose user matches and
             whose fifth column (index 4) ends with ".exe".
    """
    # The file name must be relative: an absolute component such as
    # "/file.csv" makes os.path.join discard `dataset_path` entirely.
    file_csv = os.path.join(dataset_path, "file.csv")
    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (tolerates an empty file)
        # Rows too short to hold the file-name column are ignored instead of
        # raising IndexError.
        return [
            row
            for row in reader
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """
    Counts `.exe` file events per calendar week for one user.

    Weeks are labelled with "%Y-%W" (Monday-based week of the year; "00" covers
    the days before the year's first Monday). Every week label between the
    first and the last event is present in the result, with 0 where no event
    occurred.

    :param user: User ID written to the `user` column of every output row.
    :param exe_data: Rows whose second column (index 1) is a timestamp in
                     "%m/%d/%Y %H:%M:%S" format; each row counts as one event.
    :return: DataFrame with ['user', 'week', 'num_exe_files'], sorted by week;
             empty when `exe_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_exe_counts = defaultdict(int)
    first_day = last_day = None
    for row in exe_data:
        event_day = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S").date()
        weekly_exe_counts[event_day.strftime("%Y-%W")] += 1
        if first_day is None or event_day < first_day:
            first_day = event_day
        if last_day is None or event_day > last_day:
            last_day = event_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the events of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_exe_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_exe_files"])

# %%
def get_user_logon_data(user_id, dataset_path):
    """
    Collects all rows of one user from `logon.csv`.

    The header row is not skipped explicitly; it is filtered out like any
    other row whose third column differs from `user_id`.

    :param user_id: User ID compared against the third column (index 2) of each row.
    :param dataset_path: Directory that contains `logon.csv`.
    :return: List of raw CSV rows (lists of strings) belonging to the user.
    """
    # The file name must be relative: an absolute component such as
    # "/logon.csv" makes os.path.join discard `dataset_path` entirely.
    logon_csv = os.path.join(dataset_path, "logon.csv")
    with open(logon_csv, "r", newline="") as csvfile:
        # Rows too short to hold the user column are ignored instead of
        # raising IndexError.
        return [
            row
            for row in csv.reader(csvfile)
            if len(row) > 2 and row[2] == user_id
        ]


# %%
def get_user_pc(logon_data):
    """
    Determines the PC that appears most often in a user's logon rows.

    :param logon_data: Rows whose fourth column (index 3) is the PC name.
    :return: The PC name with the highest row count; on a tie, the one seen first.
    :raises ValueError: If `logon_data` is empty.
    """
    tally = defaultdict(int)
    for record in logon_data:
        tally[record[3]] += 1
    # Iteration follows insertion order, so the first PC to reach the top count wins.
    return max(tally, key=tally.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """
    Counts, per calendar week, the distinct PCs a user touched other than `user_pc`.

    Weeks are labelled with "%Y-%W" (Monday-based week of the year; "00" covers
    the days before the year's first Monday). Every week label between the
    first and the last row is present in the result, with 0 where no other PC
    was used.

    :param user: User ID written to the `user` column of every output row.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Rows whose second column (index 1) is a timestamp in
                       "%m/%d/%Y %H:%M:%S" format and whose fourth column
                       (index 3) is the PC name.
    :return: DataFrame with ['user', 'week', 'num_other_pc'], sorted by week;
             empty when `logon_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_pc_sets = defaultdict(set)  # Unique non-primary PCs per week
    first_day = last_day = None

    for row in logon_data:
        logon_day = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S").date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        if row[3] != user_pc:
            weekly_pc_sets[logon_day.strftime("%Y-%W")].add(row[3])

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the rows of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = len(weekly_pc_sets.get(week, ()))
            current_day += timedelta(days=1)

    # Convert to DataFrame
    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_other_pc"])


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    A logon is after-hours when its time of day is before `business_start` or
    at/after `business_end`. Weeks are labelled with "%Y-%W"; every week label
    between the user's first and last logon is present, with 0 where no
    after-hours logon occurred.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours.
    :param business_end: Datetime.time representing end of business hours.
    :return: DataFrame with ['user', 'week', 'after_hours_logons'], sorted by week.
    :raises ValueError: If a row does not have exactly five columns.
    """
    after_hours_counts = defaultdict(int)
    first_day = last_day = None

    for row in logon_data:
        _, timestamp, logon_user, _, activity = row  # [id, date, user, pc, activity]

        # Only process logons for the specified user
        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        # Track the covered date range so quiet weeks can be filled with 0
        logon_day = logon_time.date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        # Compare only the time-of-day component against business hours
        logon_clock = logon_time.time()
        if logon_clock < business_start or logon_clock >= business_end:
            after_hours_counts[logon_day.strftime("%Y-%W")] += 1

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the logons of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = after_hours_counts.get(week, 0)
            current_day += timedelta(days=1)

    # Convert to DataFrame
    result_data = [(user, week, count) for week, count in sorted(weekly_counts.items())]
    after_hours_df = pd.DataFrame(
        result_data, columns=["user", "week", "after_hours_logons"]
    )

    return after_hours_df


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches for the insider CSV file for the given user in the `insider_root` directory.

    A file matches when its name starts with "r5.2-" and ends with "-<user>.csv".

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    suffix = f"-{user}.csv"
    for root, _, files in os.walk(insider_root):
        # Sorted so the result is deterministic when several files match
        for file_name in sorted(files):
            # os.walk yields bare file names, so the prefix must not contain
            # a directory part (a "/content/..." prefix can never match).
            if file_name.startswith("r5.2-") and file_name.endswith(suffix):
                return os.path.join(root, file_name)  # Return full file path if found
    return None  # Return None if no file is found


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels of the timestamps in a CSV file.

    The timestamp is taken from the 3rd column of each row. Rows with fewer
    than three columns or with an unparsable timestamp are ignored. Any error
    while reading the file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of detected `Year-Week` values.
    """
    weeks_found = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for record in csv.reader(handle):
                if len(record) >= 3:  # The timestamp column exists
                    try:
                        stamp = datetime.strptime(record[2], "%m/%d/%Y %H:%M:%S")
                        weeks_found.add(stamp.strftime("%Y-%W"))
                    except ValueError:
                        pass  # Invalid timestamp: move on to the next row
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_found


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column that is 1 for weeks found in the user's insider file, else 0.

    The DataFrame is modified in place and also returned.

    :param df: DataFrame with a 'week' column.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: The same DataFrame with an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Flag every week that appears in the user's insider CSV file
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(lambda wk: int(wk in flagged_weeks))
    else:
        # No insider file for this user: no week is an insider week
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds the per-week feature table of one user and labels insider weeks.

    :param user: The user ID to extract features for.
    :param dataset_path: Directory passed to the CSV loader functions.
    :param insider_root: Path to the folder searched for the user's insider file.
    :return: DataFrame with ['user', 'week', 'after_hours_logons', 'num_exe_files',
             'num_usb_insertions', 'num_other_pc', 'insider'].
    """
    # Logon-based features
    logons = get_user_logon_data(user, dataset_path)
    primary_pc = get_user_pc(logons)
    other_pc_full = get_num_other_PC_per_week(user, primary_pc, logons)
    after_hours_full = get_after_hours_logons(logons, user)

    # File-based feature
    exe_full = get_num_exe_per_week(user, get_user_exe_data(user, dataset_path))

    # Device-based feature
    usb_full = get_num_usb_insertions_per_week(
        user, get_user_usb_data(user, dataset_path)
    )

    # Keep only the week key and the feature column, in the final column order
    frames = [
        after_hours_full[["week", "after_hours_logons"]],
        exe_full[["week", "num_exe_files"]],
        usb_full[["week", "num_usb_insertions"]],
        other_pc_full[["week", "num_other_pc"]],
    ]

    # Outer-join on "week" so a week seen by any feature is kept
    combined = frames[0]
    for frame in frames[1:]:
        combined = combined.merge(frame, on="week", how="outer")

    # Weeks missing from a feature get a count of 0
    combined.fillna(0, inplace=True)

    combined.insert(0, "user", user)
    return label_insider_weeks(combined, user, insider_root)

# Example usage
# Loads a per-week feature table for one user from a CSV file and prints it.
# The table is read from disk here; the commented-out call below would build
# it from the raw logs instead.
dataset_path = os.path.join("Insider threat dataset", "r5.2")
user = "MIB0203"
file_path = "/MIB0203.csv"  # Change to your actual file path
final_df = pd.read_csv(file_path)
# NOTE(review): `insider_root` is not defined anywhere in this cell; define it
# before enabling the call below.
# final_df = combine_user_feature_data(user, dataset_path, insider_root)
print(final_df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   MIB0203  2010-01                   5              0                 0.0   
1   MIB0203  2010-02                   7              0                 0.0   
2   MIB0203  2010-03                   6              0                 0.0   
3   MIB0203  2010-04                   5              0                 0.0   
4   MIB0203  2010-05                   5              0                 0.0   
5   MIB0203  2010-06                   6              0                 0.0   
6   MIB0203  2010-07                   5              0                 0.0   
7   MIB0203  2010-08                   5              0                 0.0   
8   MIB0203  2010-09                   6              0                 0.0   
9   MIB0203  2010-10                   5              0                 0.0   
10  MIB0203  2010-11                   5              0                 0.0   
11  MIB0203  2010-12                   5            

In [76]:

# # Access the 'num_other_pc' data from the 'final_df' DataFrame
# num_other_pc_data = final_df[['num_other_pc']]

# # Initialize the Isolation Forest model
# iso_forest = IsolationForest(contamination='auto', random_state=42)

# # Fit the model
# iso_forest.fit(num_other_pc_data)

# # Predict anomalies and add results to the 'final_df' DataFrame
# final_df['anomaly'] = iso_forest.predict(num_other_pc_data)
# final_df['anomaly_score'] = iso_forest.decision_function(num_other_pc_data)

# # Convert anomaly labels to binary (1 for anomaly, 0 for normal); predict() marks anomalies as -1
# final_df['anomaly'] = final_df['anomaly'].apply(lambda x: 1 if x == -1 else 0)

# # Display the rows where anomalies were detected
# print(final_df[final_df['anomaly'] == 1])

In [77]:
# Print how many rows received each predicted label.
# NOTE(review): `y_pred` is first assigned in a later cell of this notebook;
# run that cell before this one.
print(pd.Series(y_pred).value_counts())


1    16
Name: count, dtype: int64


In [78]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the per-week feature table for this experiment from CSV.
# This cell works on its own copy (`df`) rather than on `final_df`.
df = pd.read_csv('/MIB0203.csv')  # Make sure the path is correct for your environment

# Select Features
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]  # Using df instead of final_df
y_true = df["insider"]  # Ground truth for evaluation

# Normalize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train One-Class SVM with a Higher `nu`
# The model is fitted on all rows, including those labelled insider.
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)  # Increased nu from 0.2 to 0.4
oc_svm.fit(X_scaled)

# Predict Anomalies
decision_scores = oc_svm.decision_function(X_scaled)

# Use a fixed threshold of 0.2 on the decision score instead of a percentile.
# A row is labelled anomalous (1) when its score is below the threshold.
# NOTE(review): the recorded run of this cell labelled every row as anomalous;
# a threshold above 0 presumably also captures inliers - confirm against the
# decision_function documentation.
threshold = 0.2  # Change this to any value you prefer
y_pred = [1 if score < threshold else 0 for score in decision_scores]

# Calculate Metrics (positive class = insider; zero_division=1 is passed to each metric)
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Debug: Print Prediction Counts
print("Predicted Labels:\n", pd.Series(y_pred).value_counts())
print("True Labels:\n", pd.Series(y_true).value_counts())


Precision: 0.0455
Recall: 1.0000
F1-score: 0.0870
Predicted Labels:
 1    44
Name: count, dtype: int64
True Labels:
 insider
0    42
1     2
Name: count, dtype: int64


In [79]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Select Features (this cell uses the `final_df` table loaded in an earlier cell)
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = final_df[feature_cols]
y_true = final_df["insider"]

# Normalize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train One-Class SVM with a Higher `nu`
# The model is fitted on all rows, including those labelled insider.
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)  # Increased from 0.2 to 0.4
oc_svm.fit(X_scaled)

# Predict Anomalies
decision_scores = oc_svm.decision_function(X_scaled)


# Use a fixed threshold of 0.5 on the decision score instead of a percentile.
# A row is labelled anomalous (1) when its score is below the threshold.
# NOTE(review): the recorded run of this cell labelled every row as anomalous;
# confirm that a positive threshold is intended.
threshold = 0.5  # Change this to any value you prefer
y_pred = [1 if score < threshold else 0 for score in decision_scores]

# Calculate Metrics (positive class = insider; zero_division=1 is passed to each metric)
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Debug: Print Prediction Counts
print("Predicted Labels:\n", pd.Series(y_pred).value_counts())
print("True Labels:\n", pd.Series(y_true).value_counts())


Precision: 0.0455
Recall: 1.0000
F1-score: 0.0870
Predicted Labels:
 1    44
Name: count, dtype: int64
True Labels:
 insider
0    42
1     2
Name: count, dtype: int64


In [80]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

#  Select Features for Training (uses the `final_df` table loaded in an earlier cell)
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = final_df[feature_cols]
y_true = final_df["insider"]  # Ground truth (0 = normal, 1 = anomaly)

# Normalize the Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#  Train One-Class SVM
# The model is fitted on all rows, including those labelled insider.
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.9)  # Adjust nu for anomaly proportion
oc_svm.fit(X_scaled)

# Predict Anomalies
decision_scores = oc_svm.decision_function(X_scaled)  # Get decision function scores
# The threshold is the 90th percentile of the scores, and every row scoring
# strictly below it is labelled anomalous (1).
# NOTE(review): this flags the rows below the 90th percentile, not a 10% tail;
# to flag only the lowest-scoring 10% the percentile would have to be 10.
# Confirm which one is intended.
threshold = np.percentile(decision_scores, 90)
y_pred = [1 if score < threshold else 0 for score in decision_scores]

#  Calculate Evaluation Metrics (positive class = insider)
precision = precision_score(y_true, y_pred, pos_label=1)
recall = recall_score(y_true, y_pred, pos_label=1)
f1 = f1_score(y_true, y_pred, pos_label=1)

#  Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Precision: 0.1333
Recall: 1.0000
F1-score: 0.2353


In [81]:
# Define fixed threshold values for anomaly detection
# This cell reuses `decision_scores` and `y_true` left behind by a previously
# executed cell; it does not refit the model.
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]  # Try different fixed thresholds

# Loop through each threshold and compute precision, recall, F1-score
for threshold in thresholds:
    y_pred = np.where(decision_scores < threshold, 1, 0)  # Label as "1" if below threshold (anomaly)

    # Positive class = insider; zero_division=1 is passed to each metric
    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.1333, Recall = 1.0000, F1 Score = 0.2353
Threshold = -0.1000: Precision = 0.1333, Recall = 1.0000, F1 Score = 0.2353
Threshold = 0.0000: Precision = 0.0455, Recall = 1.0000, F1 Score = 0.0870
Threshold = 0.1000: Precision = 0.0455, Recall = 1.0000, F1 Score = 0.0870
Threshold = 0.2000: Precision = 0.0455, Recall = 1.0000, F1 Score = 0.0870


In [82]:
# Show the first rows of `final_df` whose USB insertion count is non-zero.
final_df[final_df["num_usb_insertions"]!=0].head()

Unnamed: 0,user,week,after_hours_logons,num_exe_files,num_usb_insertions,num_other_pc,insider
40,MIB0203,2010-41,7,0,1.0,0,1
41,MIB0203,2010-42,7,0,1.0,1,1


In [83]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """
    Collects the USB "Connect" events of one user from `device.csv`.

    :param user_id: User ID compared against the third column (index 2) of each row.
    :param dataset_path: Directory that contains `device.csv`.
    :return: List of raw CSV rows (lists of strings) whose user matches and
             whose sixth column (index 5) equals "Connect".
    """
    # The file name must be relative: an absolute component such as
    # "/device.csv" makes os.path.join discard `dataset_path` entirely.
    device_csv = os.path.join(dataset_path, "device.csv")
    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (tolerates an empty file)
        # Rows too short to hold the activity column are ignored instead of
        # raising IndexError.
        return [
            row
            for row in reader
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """
    Counts USB insertions per calendar week for one user.

    Weeks are labelled with "%Y-%W" (Monday-based week of the year; "00" covers
    the days before the year's first Monday). Every week label between the
    first and the last event is present in the result, with 0 where no event
    occurred.

    :param user: User ID written to the `user` column of every output row.
    :param usb_data: Rows whose second column (index 1) is a timestamp in
                     "%m/%d/%Y %H:%M:%S" format; each row counts as one insertion.
    :return: DataFrame with ['user', 'week', 'num_usb_insertions'], sorted by
             week; empty when `usb_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_usb_counts = defaultdict(int)
    first_day = last_day = None
    for row in usb_data:
        event_day = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S").date()
        weekly_usb_counts[event_day.strftime("%Y-%W")] += 1
        if first_day is None or event_day < first_day:
            first_day = event_day
        if last_day is None or event_day > last_day:
            last_day = event_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the events of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_usb_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_usb_insertions"])


def get_user_exe_data(user_id, dataset_path):
    """
    Collects the `.exe` file events of one user from `file.csv`.

    :param user_id: User ID compared against the third column (index 2) of each row.
    :param dataset_path: Directory that contains `file.csv`.
    :return: List of raw CSV rows (lists of strings) whose user matches and
             whose fifth column (index 4) ends with ".exe".
    """
    # The file name must be relative: an absolute component such as
    # "/file.csv" makes os.path.join discard `dataset_path` entirely.
    file_csv = os.path.join(dataset_path, "file.csv")
    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (tolerates an empty file)
        # Rows too short to hold the file-name column are ignored instead of
        # raising IndexError.
        return [
            row
            for row in reader
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """
    Counts `.exe` file events per calendar week for one user.

    Weeks are labelled with "%Y-%W" (Monday-based week of the year; "00" covers
    the days before the year's first Monday). Every week label between the
    first and the last event is present in the result, with 0 where no event
    occurred.

    :param user: User ID written to the `user` column of every output row.
    :param exe_data: Rows whose second column (index 1) is a timestamp in
                     "%m/%d/%Y %H:%M:%S" format; each row counts as one event.
    :return: DataFrame with ['user', 'week', 'num_exe_files'], sorted by week;
             empty when `exe_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_exe_counts = defaultdict(int)
    first_day = last_day = None
    for row in exe_data:
        event_day = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S").date()
        weekly_exe_counts[event_day.strftime("%Y-%W")] += 1
        if first_day is None or event_day < first_day:
            first_day = event_day
        if last_day is None or event_day > last_day:
            last_day = event_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the events of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_exe_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_exe_files"])

# %%
def get_user_logon_data(user_id, dataset_path):
    """
    Collects all rows of one user from `logon.csv`.

    The header row is not skipped explicitly; it is filtered out like any
    other row whose third column differs from `user_id`.

    :param user_id: User ID compared against the third column (index 2) of each row.
    :param dataset_path: Directory that contains `logon.csv`.
    :return: List of raw CSV rows (lists of strings) belonging to the user.
    """
    # The file name must be relative: an absolute component such as
    # "/logon.csv" makes os.path.join discard `dataset_path` entirely.
    logon_csv = os.path.join(dataset_path, "logon.csv")
    with open(logon_csv, "r", newline="") as csvfile:
        # Rows too short to hold the user column are ignored instead of
        # raising IndexError.
        return [
            row
            for row in csv.reader(csvfile)
            if len(row) > 2 and row[2] == user_id
        ]


# %%
def get_user_pc(logon_data):
    """
    Determines the PC that appears most often in a user's logon rows.

    :param logon_data: Rows whose fourth column (index 3) is the PC name.
    :return: The PC name with the highest row count; on a tie, the one seen first.
    :raises ValueError: If `logon_data` is empty.
    """
    tally = defaultdict(int)
    for record in logon_data:
        tally[record[3]] += 1
    # Iteration follows insertion order, so the first PC to reach the top count wins.
    return max(tally, key=tally.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """
    Counts, per calendar week, the distinct PCs a user touched other than `user_pc`.

    Weeks are labelled with "%Y-%W" (Monday-based week of the year; "00" covers
    the days before the year's first Monday). Every week label between the
    first and the last row is present in the result, with 0 where no other PC
    was used.

    :param user: User ID written to the `user` column of every output row.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Rows whose second column (index 1) is a timestamp in
                       "%m/%d/%Y %H:%M:%S" format and whose fourth column
                       (index 3) is the PC name.
    :return: DataFrame with ['user', 'week', 'num_other_pc'], sorted by week;
             empty when `logon_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_pc_sets = defaultdict(set)  # Unique non-primary PCs per week
    first_day = last_day = None

    for row in logon_data:
        logon_day = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S").date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        if row[3] != user_pc:
            weekly_pc_sets[logon_day.strftime("%Y-%W")].add(row[3])

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the rows of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = len(weekly_pc_sets.get(week, ()))
            current_day += timedelta(days=1)

    # Convert to DataFrame
    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_other_pc"])


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    A logon is after-hours when its time of day is before `business_start` or
    at/after `business_end`. Weeks are labelled with "%Y-%W"; every week label
    between the user's first and last logon is present, with 0 where no
    after-hours logon occurred.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours.
    :param business_end: Datetime.time representing end of business hours.
    :return: DataFrame with ['user', 'week', 'after_hours_logons'], sorted by week.
    :raises ValueError: If a row does not have exactly five columns.
    """
    after_hours_counts = defaultdict(int)
    first_day = last_day = None

    for row in logon_data:
        _, timestamp, logon_user, _, activity = row  # [id, date, user, pc, activity]

        # Only process logons for the specified user
        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        # Track the covered date range so quiet weeks can be filled with 0
        logon_day = logon_time.date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        # Compare only the time-of-day component against business hours
        logon_clock = logon_time.time()
        if logon_clock < business_start or logon_clock >= business_end:
            after_hours_counts[logon_day.strftime("%Y-%W")] += 1

    weekly_counts = {}
    if first_day is not None:
        # Walk the range day by day. Parsing "%Y-%W-%w" back into a date and
        # stepping 7 days at a time skips the partial "YYYY-00" week at the
        # start of a year, which silently dropped the logons of that week.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = after_hours_counts.get(week, 0)
            current_day += timedelta(days=1)

    # Convert to DataFrame
    result_data = [(user, week, count) for week, count in sorted(weekly_counts.items())]
    after_hours_df = pd.DataFrame(
        result_data, columns=["user", "week", "after_hours_logons"]
    )

    return after_hours_df


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches for the insider CSV file for the given user in the `insider_root` directory.

    A file matches when its name starts with "r5.2-" and ends with "-<user>.csv".

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    suffix = f"-{user}.csv"
    for root, _, files in os.walk(insider_root):
        # Sorted so the result is deterministic when several files match
        for file_name in sorted(files):
            # os.walk yields bare file names, so the prefix must not contain
            # a directory part (a "/content/..." prefix can never match).
            if file_name.startswith("r5.2-") and file_name.endswith(suffix):
                return os.path.join(root, file_name)  # Return full file path if found
    return None  # Return None if no file is found


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels of the timestamps in a CSV file.

    The timestamp is taken from the 3rd column of each row. Rows with fewer
    than three columns or with an unparsable timestamp are ignored. Any error
    while reading the file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of detected `Year-Week` values.
    """
    weeks_found = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for record in csv.reader(handle):
                if len(record) >= 3:  # The timestamp column exists
                    try:
                        stamp = datetime.strptime(record[2], "%m/%d/%Y %H:%M:%S")
                        weeks_found.add(stamp.strftime("%Y-%W"))
                    except ValueError:
                        pass  # Invalid timestamp: move on to the next row
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_found


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column that is 1 for weeks found in the user's insider file, else 0.

    The DataFrame is modified in place and also returned.

    :param df: DataFrame with a 'week' column.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: The same DataFrame with an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Flag every week that appears in the user's insider CSV file
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(lambda wk: int(wk in flagged_weeks))
    else:
        # No insider file for this user: no week is an insider week
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds one weekly feature table for `user` and labels its insider weeks.

    The four per-week feature frames (after-hours logons, .exe files, USB
    insertions, other PCs) are outer-joined on 'week'; gaps are filled with 0.

    :param user: The user ID to build features for.
    :param dataset_path: Passed to the raw-data loaders.
    :param insider_root: Passed to `label_insider_weeks`.
    :return: DataFrame with 'user', 'week', the four feature columns and 'insider'.
    """
    # Logon-derived features share one pass over the user's logon rows
    logon_rows = get_user_logon_data(user, dataset_path)
    primary_pc = get_user_pc(logon_rows)
    other_pc_weekly = get_num_other_PC_per_week(user, primary_pc, logon_rows)
    after_hours_weekly = get_after_hours_logons(logon_rows, user)

    exe_weekly = get_num_exe_per_week(user, get_user_exe_data(user, dataset_path))
    usb_weekly = get_num_usb_insertions_per_week(
        user, get_user_usb_data(user, dataset_path)
    )

    # Keep only the join key and the feature column of each frame
    feature_frames = [
        after_hours_weekly[["week", "after_hours_logons"]],
        exe_weekly[["week", "num_exe_files"]],
        usb_weekly[["week", "num_usb_insertions"]],
        other_pc_weekly[["week", "num_other_pc"]],
    ]

    # Outer-join left to right so a week present in any frame is kept
    merged_df = feature_frames[0]
    for frame in feature_frames[1:]:
        merged_df = merged_df.merge(frame, on="week", how="outer")

    # A week missing from a frame means "no such events": fill with 0
    merged_df = merged_df.fillna(0)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage: load and display the saved weekly feature table of one user
user = "ALT1465"
dataset_path = os.path.join("Insider threat dataset", "r5.2")
file_path = f"/{user}.csv"  # Change to your actual file path
final_df = pd.read_csv(file_path)
# The table can instead be rebuilt from the raw logs with
# combine_user_feature_data(user, dataset_path, insider_root).
print(final_df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   ALT1465  2010-01                   0              0                 0.0   
1   ALT1465  2010-02                   4              0                 0.0   
2   ALT1465  2010-03                   4              0                 0.0   
3   ALT1465  2010-04                   4              0                 0.0   
4   ALT1465  2010-05                   1              0                 0.0   
5   ALT1465  2010-06                   0              0                 0.0   
6   ALT1465  2010-07                   5              0                 0.0   
7   ALT1465  2010-08                   2              0                 0.0   
8   ALT1465  2010-09                   2              0                 0.0   
9   ALT1465  2010-10                   5              0                 0.0   
10  ALT1465  2010-11                   2              0                 0.0   
11  ALT1465  2010-12                   3            

In [84]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the weekly feature table (feature columns plus the `insider` label)
df = pd.read_csv('/ALT1465.csv')  # Adjust the path for your environment

# Feature matrix and ground-truth labels
feature_cols = [
    "after_hours_logons",
    "num_exe_files",
    "num_usb_insertions",
    "num_other_pc",
]
X = df[feature_cols]
y_true = df["insider"]

# Standardise the features before fitting the kernel model
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One-Class SVM (RBF kernel, nu=0.4) fitted on every row of the table
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)
oc_svm.fit(X_scaled)

# Decision score of each row under the fitted model
decision_scores = oc_svm.decision_function(X_scaled)

# Rows scoring below the fixed cut-off are predicted as insider weeks (1)
threshold = 0.2
y_pred = [int(score < threshold) for score in decision_scores]

# Score the predictions; the insider class (1) is the positive label
metric_options = {"pos_label": 1, "zero_division": 1}
precision = precision_score(y_true, y_pred, **metric_options)
recall = recall_score(y_true, y_pred, **metric_options)
f1 = f1_score(y_true, y_pred, **metric_options)

# Report the metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Debug: class balance of predictions versus ground truth
predicted_counts = pd.Series(y_pred).value_counts()
true_counts = pd.Series(y_true).value_counts()
print("Predicted Labels:\n", predicted_counts)
print("True Labels:\n", true_counts)

Precision: 0.1000
Recall: 1.0000
F1-score: 0.1818
Predicted Labels:
 1    20
0    16
Name: count, dtype: int64
True Labels:
 insider
0    34
1     2
Name: count, dtype: int64


In [85]:

# Read the weekly feature table (feature columns plus the `insider` label)
df = pd.read_csv('/ALT1465.csv')  # Adjust the path for your environment

# Feature matrix and ground-truth labels
feature_cols = [
    "after_hours_logons",
    "num_exe_files",
    "num_usb_insertions",
    "num_other_pc",
]
X = df[feature_cols]
y_true = df["insider"]

# Standardise the features before fitting the kernel model
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One-Class SVM (RBF kernel, nu=0.4) fitted on every row of the table
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)
oc_svm.fit(X_scaled)

# Decision score of each row under the fitted model
decision_scores = oc_svm.decision_function(X_scaled)

# Rows scoring below the fixed cut-off are predicted as insider weeks (1)
threshold = 0.6
y_pred = [int(score < threshold) for score in decision_scores]

# Score the predictions; the insider class (1) is the positive label
metric_options = {"pos_label": 1, "zero_division": 1}
precision = precision_score(y_true, y_pred, **metric_options)
recall = recall_score(y_true, y_pred, **metric_options)
f1 = f1_score(y_true, y_pred, **metric_options)

# Report the metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Debug: class balance of predictions versus ground truth
predicted_counts = pd.Series(y_pred).value_counts()
true_counts = pd.Series(y_true).value_counts()
print("Predicted Labels:\n", predicted_counts)
print("True Labels:\n", true_counts)

Precision: 0.0556
Recall: 1.0000
F1-score: 0.1053
Predicted Labels:
 1    36
Name: count, dtype: int64
True Labels:
 insider
0    34
1     2
Name: count, dtype: int64


In [86]:
# Candidate fixed cut-offs on the decision score
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]

# Re-score the same decision scores at every cut-off
for threshold in thresholds:
    # Scores strictly below the cut-off become 1 (anomaly), everything else 0
    y_pred = (decision_scores < threshold).astype(int)

    metric_options = {"pos_label": 1, "zero_division": 1}
    precision = precision_score(y_true, y_pred, **metric_options)
    recall = recall_score(y_true, y_pred, **metric_options)
    f1 = f1_score(y_true, y_pred, **metric_options)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.3333, Recall = 1.0000, F1 Score = 0.5000
Threshold = -0.1000: Precision = 0.2857, Recall = 1.0000, F1 Score = 0.4444
Threshold = 0.0000: Precision = 0.2000, Recall = 1.0000, F1 Score = 0.3333
Threshold = 0.1000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = 0.2000: Precision = 0.1000, Recall = 1.0000, F1 Score = 0.1818


In [94]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """
    Returns the USB "Connect" rows of `device.csv` that belong to one user.

    :param user_id: User ID compared against the 3rd column of each row.
    :param dataset_path: Directory that contains `device.csv`.
    :return: List of raw CSV rows (lists of strings) whose 3rd column equals
        `user_id` and whose 6th column equals "Connect".
    :raises OSError: If `device.csv` cannot be opened.
    """
    # The file name must be relative: os.path.join discards `dataset_path`
    # entirely when a later component starts with "/".
    device_csv = os.path.join(dataset_path, "device.csv")

    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (an empty file simply yields no rows)
        return [
            row
            for row in reader
            # Rows too short to hold the activity column are skipped
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """
    Counts USB insertion rows per `%Y-%W` week, zero-filling the weeks in between.

    :param user: User ID written into the 'user' column of the result.
    :param usb_data: Rows whose 2nd field is a `%m/%d/%Y %H:%M:%S` timestamp.
    :return: DataFrame ['user', 'week', 'num_usb_insertions'] sorted by week;
        empty when `usb_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_usb_counts = defaultdict(int)
    for row in usb_data:
        event_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_usb_counts[event_time.strftime("%Y-%W")] += 1

    columns = ["user", "week", "num_usb_insertions"]
    if not weekly_usb_counts:
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(weekly_usb_counts), max(weekly_usb_counts)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # events in such a week would otherwise be dropped.
    complete_weeks = set(weekly_usb_counts)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first event is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    output_list = [
        [user, week, weekly_usb_counts.get(week, 0)]
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(output_list, columns=columns)


def get_user_exe_data(user_id, dataset_path):
    """
    Returns the rows of `file.csv` in which one user touched a `.exe` file.

    :param user_id: User ID compared against the 3rd column of each row.
    :param dataset_path: Directory that contains `file.csv`.
    :return: List of raw CSV rows (lists of strings) whose 3rd column equals
        `user_id` and whose 5th column ends with ".exe".
    :raises OSError: If `file.csv` cannot be opened.
    """
    # The file name must be relative: os.path.join discards `dataset_path`
    # entirely when a later component starts with "/".
    file_csv = os.path.join(dataset_path, "file.csv")

    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (an empty file simply yields no rows)
        return [
            row
            for row in reader
            # Rows too short to hold the file-name column are skipped
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """
    Counts `.exe` file rows per `%Y-%W` week, zero-filling the weeks in between.

    :param user: User ID written into the 'user' column of the result.
    :param exe_data: Rows whose 2nd field is a `%m/%d/%Y %H:%M:%S` timestamp.
    :return: DataFrame ['user', 'week', 'num_exe_files'] sorted by week;
        empty when `exe_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_exe_counts = defaultdict(int)
    for row in exe_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_exe_counts[file_time.strftime("%Y-%W")] += 1

    columns = ["user", "week", "num_exe_files"]
    if not weekly_exe_counts:
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(weekly_exe_counts), max(weekly_exe_counts)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # events in such a week would otherwise be dropped.
    complete_weeks = set(weekly_exe_counts)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first event is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    output_list = [
        [user, week, weekly_exe_counts.get(week, 0)]
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(output_list, columns=columns)

# %%
def get_user_logon_data(user_id, dataset_path):
    """
    Returns every row of `logon.csv` whose 3rd column equals `user_id`.

    :param user_id: User ID to filter on.
    :param dataset_path: Directory that contains `logon.csv`.
    :return: List of raw CSV rows (lists of strings) for that user.
    :raises OSError: If `logon.csv` cannot be opened.
    """
    # The file name must be relative: os.path.join discards `dataset_path`
    # entirely when a later component starts with "/".
    logon_csv = os.path.join(dataset_path, "logon.csv")

    with open(logon_csv, "r", newline="") as csvfile:
        return [
            row
            for row in csv.reader(csvfile)
            # Rows too short to hold the user column are skipped
            if len(row) > 2 and row[2] == user_id
        ]


# %%
def get_user_pc(logon_data):
    """
    Returns the value that occurs most often in the 4th column of `logon_data`.

    On a tie the value encountered first wins.

    :param logon_data: Rows whose 4th field identifies the PC.
    :return: The most frequent PC value.
    :raises ValueError: If `logon_data` is empty.
    """
    tally = defaultdict(int)
    for record in logon_data:
        tally[record[3]] += 1
    return max(tally, key=tally.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """
    Counts, per `%Y-%W` week, the distinct PCs other than `user_pc` in the rows.

    Weeks between the first and last observed week are zero-filled.

    :param user: User ID written into the 'user' column of the result.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Rows in the form [id, date, user, pc, ...] with a
        `%m/%d/%Y %H:%M:%S` date.
    :return: DataFrame ['user', 'week', 'num_other_pc'] sorted by week;
        empty when `logon_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_pc_counts = defaultdict(set)  # week -> distinct non-primary PCs
    all_weeks = set()  # every week with at least one row

    for row in logon_data:
        logon_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        week = logon_time.strftime("%Y-%W")
        all_weeks.add(week)
        if row[3] != user_pc:
            weekly_pc_counts[week].add(row[3])

    columns = ["user", "week", "num_other_pc"]
    if not all_weeks:
        # Same empty-input behaviour as the other per-week feature functions
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(all_weeks), max(all_weeks)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # rows in such a week would otherwise be dropped.
    complete_weeks = set(all_weeks)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first row is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    output_list = [
        [user, week, len(weekly_pc_counts.get(week, ()))]
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(output_list, columns=columns)


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    Only rows whose activity is "logon" (case-insensitive) and whose user equals
    `user` are considered. A logon is after-hours when its time of day is before
    `business_start` or at/after `business_end`. Weeks between the first and
    last logon week are zero-filled. Rows with fewer than five fields or an
    unparsable timestamp are skipped.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours.
    :param business_end: Datetime.time representing end of business hours.
    :return: DataFrame with ['user', 'week', 'after_hours_logons']
    """
    after_hours_counts = defaultdict(int)
    all_weeks = set()  # every week with at least one logon of this user

    for row in logon_data:
        if len(row) < 5:
            continue  # Malformed row: not enough columns
        timestamp, logon_user, activity = row[1], row[2], row[4]

        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        logon_week = logon_time.strftime("%Y-%W")
        all_weeks.add(logon_week)

        logon_hour = logon_time.time()
        if logon_hour < business_start or logon_hour >= business_end:
            after_hours_counts[logon_week] += 1

    columns = ["user", "week", "after_hours_logons"]
    if not all_weeks:
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(all_weeks), max(all_weeks)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # logons in such a week would otherwise be dropped.
    complete_weeks = set(all_weeks)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first logon is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    result_data = [
        (user, week, after_hours_counts.get(week, 0))
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(result_data, columns=columns)


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches for the insider CSV file for the given user in the `insider_root` directory.

    A file matches when its name starts with "r5.2-" and ends with "-<user>.csv".

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    suffix = f"-{user}.csv"
    for root, dirs, files in os.walk(insider_root):
        dirs.sort()  # Visit subfolders in a stable order
        for file in sorted(files):
            # os.walk yields bare file names, so the prefix must not contain
            # any directory part or it can never match.
            if file.startswith("r5.2-") and file.endswith(suffix):
                return os.path.join(root, file)  # Full path of the first match
    return None  # Return None if no file is found


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels found in the 3rd column of a CSV file.

    Rows with fewer than three fields, or whose 3rd field is not a
    `%m/%d/%Y %H:%M:%S` timestamp, are ignored. Any error raised while opening
    or reading the file is printed and whatever was collected so far is returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of `%Y-%W` strings, one per week that has a parsable timestamp.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    weeks_seen = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for record in csv.reader(handle):
                # Only rows that actually carry a timestamp column are considered
                if len(record) >= 3:
                    try:
                        stamp = datetime.strptime(record[2], timestamp_format)
                        weeks_seen.add(stamp.strftime("%Y-%W"))
                    except ValueError:
                        # Not a timestamp in the expected format; ignore the row
                        pass
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_seen


def label_insider_weeks(df, user, insider_root):
    """
    Adds a 0/1 'insider' column to `df` (in place) and returns the same DataFrame.

    A week is labelled 1 when its `week` value appears among the weeks extracted
    from the user's insider file; if no such file is found, every row gets 0.

    :param df: DataFrame with at least a 'week' column.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Folder that is searched for the user's insider file.
    :return: `df`, now carrying an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Weeks during which the insider file records activity for this user
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(lambda wk: int(wk in flagged_weeks))
    else:
        # No insider file for this user: nothing is flagged
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds one weekly feature table for `user` and labels its insider weeks.

    The four per-week feature frames (after-hours logons, .exe files, USB
    insertions, other PCs) are outer-joined on 'week'; gaps are filled with 0.

    :param user: The user ID to build features for.
    :param dataset_path: Passed to the raw-data loaders.
    :param insider_root: Passed to `label_insider_weeks`.
    :return: DataFrame with 'user', 'week', the four feature columns and 'insider'.
    """
    # Logon-derived features share one pass over the user's logon rows
    logon_rows = get_user_logon_data(user, dataset_path)
    primary_pc = get_user_pc(logon_rows)
    other_pc_weekly = get_num_other_PC_per_week(user, primary_pc, logon_rows)
    after_hours_weekly = get_after_hours_logons(logon_rows, user)

    exe_weekly = get_num_exe_per_week(user, get_user_exe_data(user, dataset_path))
    usb_weekly = get_num_usb_insertions_per_week(
        user, get_user_usb_data(user, dataset_path)
    )

    # Keep only the join key and the feature column of each frame
    feature_frames = [
        after_hours_weekly[["week", "after_hours_logons"]],
        exe_weekly[["week", "num_exe_files"]],
        usb_weekly[["week", "num_usb_insertions"]],
        other_pc_weekly[["week", "num_other_pc"]],
    ]

    # Outer-join left to right so a week present in any frame is kept
    merged_df = feature_frames[0]
    for frame in feature_frames[1:]:
        merged_df = merged_df.merge(frame, on="week", how="outer")

    # A week missing from a frame means "no such events": fill with 0
    merged_df = merged_df.fillna(0)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage: load and display the saved weekly feature table of one user
user = "NIV1608"
dataset_path = os.path.join("Insider threat dataset", "r5.2")
file_path = f"/{user}.csv"  # Change to your actual file path
final_df = pd.read_csv(file_path)
# The table can instead be rebuilt from the raw logs with
# combine_user_feature_data(user, dataset_path, insider_root).
print(final_df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   NIV1608  2010-01                   3              0                 0.0   
1   NIV1608  2010-02                   2              0                 0.0   
2   NIV1608  2010-03                   4              0                 0.0   
3   NIV1608  2010-04                   1              0                 0.0   
4   NIV1608  2010-05                   2              0                 0.0   
..      ...      ...                 ...            ...                 ...   
56  NIV1608  2011-05                   5              0                 7.0   
57  NIV1608  2011-06                   7              0                 7.0   
58  NIV1608  2011-07                   3              0                 4.0   
59  NIV1608  2011-08                   1              0                 0.0   
60  NIV1608  2011-09                   0              0                 0.0   

    num_other_pc  insider  
0              0       

In [99]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the weekly feature table (feature columns plus the `insider` label)
df = pd.read_csv('/NIV1608.csv')  # Adjust the path for your environment

# Feature matrix and ground-truth labels
feature_cols = [
    "after_hours_logons",
    "num_exe_files",
    "num_usb_insertions",
    "num_other_pc",
]
X = df[feature_cols]
y_true = df["insider"]

# Standardise the features before fitting the kernel model
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One-Class SVM (RBF kernel, nu=0.4) fitted on every row of the table
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)
oc_svm.fit(X_scaled)

# Decision score of each row under the fitted model
decision_scores = oc_svm.decision_function(X_scaled)

# Rows scoring below the fixed cut-off are predicted as insider weeks (1)
threshold = 0.2
y_pred = [int(score < threshold) for score in decision_scores]

# Score the predictions; the insider class (1) is the positive label
metric_options = {"pos_label": 1, "zero_division": 1}
precision = precision_score(y_true, y_pred, **metric_options)
recall = recall_score(y_true, y_pred, **metric_options)
f1 = f1_score(y_true, y_pred, **metric_options)

# Report the metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Debug: class balance of predictions versus ground truth
predicted_counts = pd.Series(y_pred).value_counts()
true_counts = pd.Series(y_true).value_counts()
print("Predicted Labels:\n", predicted_counts)
print("True Labels:\n", true_counts)

Precision: 0.0328
Recall: 1.0000
F1-score: 0.0635
Predicted Labels:
 1    61
Name: count, dtype: int64
True Labels:
 insider
0    59
1     2
Name: count, dtype: int64


In [100]:
# Candidate fixed cut-offs on the decision score
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]

# Re-score the same decision scores at every cut-off
for threshold in thresholds:
    # Scores strictly below the cut-off become 1 (anomaly), everything else 0
    y_pred = (decision_scores < threshold).astype(int)

    metric_options = {"pos_label": 1, "zero_division": 1}
    precision = precision_score(y_true, y_pred, **metric_options)
    recall = recall_score(y_true, y_pred, **metric_options)
    f1 = f1_score(y_true, y_pred, **metric_options)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = -0.1000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = 0.0000: Precision = 0.0645, Recall = 1.0000, F1 Score = 0.1212
Threshold = 0.1000: Precision = 0.0328, Recall = 1.0000, F1 Score = 0.0635
Threshold = 0.2000: Precision = 0.0328, Recall = 1.0000, F1 Score = 0.0635


In [101]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """
    Returns the USB "Connect" rows of `device.csv` that belong to one user.

    :param user_id: User ID compared against the 3rd column of each row.
    :param dataset_path: Directory that contains `device.csv`.
    :return: List of raw CSV rows (lists of strings) whose 3rd column equals
        `user_id` and whose 6th column equals "Connect".
    :raises OSError: If `device.csv` cannot be opened.
    """
    # The file name must be relative: os.path.join discards `dataset_path`
    # entirely when a later component starts with "/".
    device_csv = os.path.join(dataset_path, "device.csv")

    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (an empty file simply yields no rows)
        return [
            row
            for row in reader
            # Rows too short to hold the activity column are skipped
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """
    Counts USB insertion rows per `%Y-%W` week, zero-filling the weeks in between.

    :param user: User ID written into the 'user' column of the result.
    :param usb_data: Rows whose 2nd field is a `%m/%d/%Y %H:%M:%S` timestamp.
    :return: DataFrame ['user', 'week', 'num_usb_insertions'] sorted by week;
        empty when `usb_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_usb_counts = defaultdict(int)
    for row in usb_data:
        event_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_usb_counts[event_time.strftime("%Y-%W")] += 1

    columns = ["user", "week", "num_usb_insertions"]
    if not weekly_usb_counts:
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(weekly_usb_counts), max(weekly_usb_counts)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # events in such a week would otherwise be dropped.
    complete_weeks = set(weekly_usb_counts)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first event is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    output_list = [
        [user, week, weekly_usb_counts.get(week, 0)]
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(output_list, columns=columns)


def get_user_exe_data(user_id, dataset_path):
    """
    Returns the rows of `file.csv` in which one user touched a `.exe` file.

    :param user_id: User ID compared against the 3rd column of each row.
    :param dataset_path: Directory that contains `file.csv`.
    :return: List of raw CSV rows (lists of strings) whose 3rd column equals
        `user_id` and whose 5th column ends with ".exe".
    :raises OSError: If `file.csv` cannot be opened.
    """
    # The file name must be relative: os.path.join discards `dataset_path`
    # entirely when a later component starts with "/".
    file_csv = os.path.join(dataset_path, "file.csv")

    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (an empty file simply yields no rows)
        return [
            row
            for row in reader
            # Rows too short to hold the file-name column are skipped
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """
    Counts `.exe` file rows per `%Y-%W` week, zero-filling the weeks in between.

    :param user: User ID written into the 'user' column of the result.
    :param exe_data: Rows whose 2nd field is a `%m/%d/%Y %H:%M:%S` timestamp.
    :return: DataFrame ['user', 'week', 'num_exe_files'] sorted by week;
        empty when `exe_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_exe_counts = defaultdict(int)
    for row in exe_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_exe_counts[file_time.strftime("%Y-%W")] += 1

    columns = ["user", "week", "num_exe_files"]
    if not weekly_exe_counts:
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(weekly_exe_counts), max(weekly_exe_counts)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # events in such a week would otherwise be dropped.
    complete_weeks = set(weekly_exe_counts)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first event is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    output_list = [
        [user, week, weekly_exe_counts.get(week, 0)]
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(output_list, columns=columns)

# %%
def get_user_logon_data(user_id, dataset_path):
    """
    Returns every row of `logon.csv` whose 3rd column equals `user_id`.

    :param user_id: User ID to filter on.
    :param dataset_path: Directory that contains `logon.csv`.
    :return: List of raw CSV rows (lists of strings) for that user.
    :raises OSError: If `logon.csv` cannot be opened.
    """
    # The file name must be relative: os.path.join discards `dataset_path`
    # entirely when a later component starts with "/".
    logon_csv = os.path.join(dataset_path, "logon.csv")

    with open(logon_csv, "r", newline="") as csvfile:
        return [
            row
            for row in csv.reader(csvfile)
            # Rows too short to hold the user column are skipped
            if len(row) > 2 and row[2] == user_id
        ]


# %%
def get_user_pc(logon_data):
    """
    Returns the value that occurs most often in the 4th column of `logon_data`.

    On a tie the value encountered first wins.

    :param logon_data: Rows whose 4th field identifies the PC.
    :return: The most frequent PC value.
    :raises ValueError: If `logon_data` is empty.
    """
    tally = defaultdict(int)
    for record in logon_data:
        tally[record[3]] += 1
    return max(tally, key=tally.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """
    Counts, per `%Y-%W` week, the distinct PCs other than `user_pc` in the rows.

    Weeks between the first and last observed week are zero-filled.

    :param user: User ID written into the 'user' column of the result.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Rows in the form [id, date, user, pc, ...] with a
        `%m/%d/%Y %H:%M:%S` date.
    :return: DataFrame ['user', 'week', 'num_other_pc'] sorted by week;
        empty when `logon_data` is empty.
    :raises ValueError: If a timestamp does not match the expected format.
    """
    weekly_pc_counts = defaultdict(set)  # week -> distinct non-primary PCs
    all_weeks = set()  # every week with at least one row

    for row in logon_data:
        logon_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        week = logon_time.strftime("%Y-%W")
        all_weeks.add(week)
        if row[3] != user_pc:
            weekly_pc_counts[week].add(row[3])

    columns = ["user", "week", "num_other_pc"]
    if not all_weeks:
        # Same empty-input behaviour as the other per-week feature functions
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(all_weeks), max(all_weeks)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # rows in such a week would otherwise be dropped.
    complete_weeks = set(all_weeks)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first row is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    output_list = [
        [user, week, len(weekly_pc_counts.get(week, ()))]
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(output_list, columns=columns)


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    Only rows whose activity is "logon" (case-insensitive) and whose user equals
    `user` are considered. A logon is after-hours when its time of day is before
    `business_start` or at/after `business_end`. Weeks between the first and
    last logon week are zero-filled. Rows with fewer than five fields or an
    unparsable timestamp are skipped.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours.
    :param business_end: Datetime.time representing end of business hours.
    :return: DataFrame with ['user', 'week', 'after_hours_logons']
    """
    after_hours_counts = defaultdict(int)
    all_weeks = set()  # every week with at least one logon of this user

    for row in logon_data:
        if len(row) < 5:
            continue  # Malformed row: not enough columns
        timestamp, logon_user, activity = row[1], row[2], row[4]

        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        logon_week = logon_time.strftime("%Y-%W")
        all_weeks.add(logon_week)

        logon_hour = logon_time.time()
        if logon_hour < business_start or logon_hour >= business_end:
            after_hours_counts[logon_week] += 1

    columns = ["user", "week", "after_hours_logons"]
    if not all_weeks:
        return pd.DataFrame([], columns=columns)

    first_week, last_week = min(all_weeks), max(all_weeks)

    # Start from the observed weeks: %W labels the days before the first Monday
    # of a year as week "00", a label the Monday walk below never produces, so
    # logons in such a week would otherwise be dropped.
    complete_weeks = set(all_weeks)

    current_date = datetime.strptime(first_week + "-1", "%Y-%W-%w")
    end_date = datetime.strptime(last_week + "-1", "%Y-%W-%w")
    while current_date <= end_date:
        week_str = current_date.strftime("%Y-%W")
        # The Monday of a week "00" lies in the previous year; keep only labels
        # inside the observed range so no week before the first logon is added.
        if first_week <= week_str <= last_week:
            complete_weeks.add(week_str)
        current_date += timedelta(days=7)

    result_data = [
        (user, week, after_hours_counts.get(week, 0))
        for week in sorted(complete_weeks)
    ]
    return pd.DataFrame(result_data, columns=columns)


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches for the insider CSV file for the given user in the `insider_root` directory.

    A file matches when its name starts with "r5.2-" and ends with "-<user>.csv".

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    suffix = f"-{user}.csv"
    for root, dirs, files in os.walk(insider_root):
        dirs.sort()  # Visit subfolders in a stable order
        for file in sorted(files):
            # os.walk yields bare file names, so the prefix must not contain
            # any directory part or it can never match.
            if file.startswith("r5.2-") and file.endswith(suffix):
                return os.path.join(root, file)  # Full path of the first match
    return None  # Return None if no file is found


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels of the timestamps found in the
    3rd column of a CSV file.

    Rows with fewer than three columns, or whose 3rd column is not a
    "%m/%d/%Y %H:%M:%S" timestamp, are ignored. Any error while opening or
    reading the file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of `%Y-%W` strings.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    weeks_seen = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for record in csv.reader(handle):
                if len(record) >= 3:
                    try:
                        label = datetime.strptime(
                            record[2], timestamp_format
                        ).strftime("%Y-%W")
                    except ValueError:
                        pass  # 3rd column is not a valid timestamp
                    else:
                        weeks_seen.add(label)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_seen


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column (1/0) to `df`, in place, and returns `df`.

    A week is labelled 1 when it appears among the weeks extracted from the
    user's insider file; when no insider file is found every week is labelled 0.

    :param df: DataFrame with at least a 'week' column.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: The same DataFrame, now carrying an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Weeks present in the insider file are the positive labels.
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(
            lambda wk: 1 if wk in flagged_weeks else 0
        )
    else:
        # No insider file for this user: nothing is flagged.
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds the labelled weekly feature table for one user.

    Computes the four weekly features (after-hours logons, .exe files, USB
    insertions, other PCs used), outer-joins them on "week", fills missing
    values with 0, prepends a "user" column and adds the "insider" label.

    :param user: The user ID.
    :param dataset_path: Path passed to the `get_user_*_data` loaders.
    :param insider_root: Root folder searched for the user's insider file.
    :return: DataFrame with user, week, the four feature columns and 'insider'.
    """
    # Logon-derived features
    logon_data = get_user_logon_data(user, dataset_path)
    user_pc = get_user_pc(logon_data)
    num_other_pc = get_num_other_PC_per_week(user, user_pc, logon_data)
    after_hours_logons = get_after_hours_logons(logon_data, user)

    # File-derived feature
    exe_data = get_user_exe_data(user, dataset_path)
    num_exe_files = get_num_exe_per_week(user, exe_data)

    # Device-derived feature
    usb_data = get_user_usb_data(user, dataset_path)
    num_usb = get_num_usb_insertions_per_week(user, usb_data)

    # Keep only the join key and the feature column of each table; the first
    # table seeds the merge and the rest are outer-joined onto it in order.
    merged_df = after_hours_logons[["week", "after_hours_logons"]]
    remaining_tables = (
        num_exe_files[["week", "num_exe_files"]],
        num_usb[["week", "num_usb_insertions"]],
        num_other_pc[["week", "num_other_pc"]],
    )
    for table in remaining_tables:
        merged_df = merged_df.merge(table, on="week", how="outer")

    # Weeks missing from a feature table show up as NaN after the outer join.
    merged_df.fillna(0, inplace=True)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage: load a per-user weekly feature table from a CSV file.
dataset_path = os.path.join("Insider threat dataset", "r5.2")
user = "HMS1658"
file_path = "/HMS1658.csv"  # Change to your actual file path
df = pd.read_csv(file_path, on_bad_lines='skip')  # Malformed lines are skipped instead of raising

# NOTE: combine_user_feature_data is not called here; the table printed below comes from the CSV above.
print(df)

           user     week  after_hours_logons  num_exe_files  \
0   HMS1658.csv  2010-01                   0              0   
1   HMS1658.csv  2010-02                   4              0   
2   HMS1658.csv  2010-03                   4              0   
3   HMS1658.csv  2010-04                   4              0   
4   HMS1658.csv  2010-05                   1              0   
5   HMS1658.csv  2010-06                   0              0   
6   HMS1658.csv  2010-07                   5              0   
7   HMS1658.csv  2010-08                   2              0   
8   HMS1658.csv  2010-09                   2              0   
9   HMS1658.csv  2010-10                   5              0   
10  HMS1658.csv  2010-11                   2              0   
11  HMS1658.csv  2010-12                   3              0   
12  HMS1658.csv  2010-13                   4              0   
13  HMS1658.csv  2010-14                   1              0   
14  HMS1658.csv  2010-15                   4           

In [114]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the per-user weekly feature table
file_path = "/HMS1658.csv"  # Adjust to your file path if needed
df = pd.read_csv(file_path, on_bad_lines='skip')

# Check if the dataset is loaded correctly
print("Dataset Loaded:\n", df.head())

# Select the feature columns and the ground-truth label
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]  # Using df instead of final_df
y_true = df["insider"]  # Ground truth for evaluation

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train a One-Class SVM (RBF kernel, nu=0.4)
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)  # gamma="scale"
oc_svm.fit(X_scaled)

# Score the same rows the model was fitted on
decision_scores = oc_svm.decision_function(X_scaled)

# Check decision scores
print("Decision Scores:\n", decision_scores[:10])

# Rows whose score is below the fixed threshold are predicted as anomalies (1)
threshold = 0.01  # Change this to any value you prefer
y_pred = [1 if score < threshold else 0 for score in decision_scores]

# Check Predictions
print("Predictions:\n", y_pred[:10])

# Calculate Metrics
# NOTE(review): zero_division=1 reports an undefined metric (zero denominator) as 1;
# when y_true has no positive rows the recall printed below is therefore 1.0 - confirm this is intended.
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Dataset Loaded:
           user     week  after_hours_logons  num_exe_files  \
0  HMS1658.csv  2010-01                   0              0   
1  HMS1658.csv  2010-02                   4              0   
2  HMS1658.csv  2010-03                   4              0   
3  HMS1658.csv  2010-04                   4              0   
4  HMS1658.csv  2010-05                   1              0   

   num_usb_insertions  num_other_pc  insider  
0                 0.0             2        0  
1                 0.0             2        0  
2                 0.0             2        0  
3                 0.0             2        0  
4                 0.0             0        0  
Decision Scores:
 [-1.03034908e-01  9.44578350e-02  9.44578350e-02  9.44578350e-02
 -4.47549744e-01 -6.66545136e-02  5.14413227e-05 -3.39440310e-04
 -3.39440310e-04  5.14413227e-05]
Predictions:
 [1, 0, 0, 0, 1, 1, 1, 1, 1, 1]
Precision: 0.0000
Recall: 1.0000
F1-score: 0.0000


In [115]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the per-user weekly feature table
file_path = "/HMS1658.csv"  # Adjust to your file path if needed
df = pd.read_csv(file_path, on_bad_lines='skip')

# Check if the dataset is loaded correctly
print("Dataset Loaded:\n", df.head())

# Select the feature columns and the ground-truth label
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]  # Using df instead of final_df
y_true = df["insider"]  # Ground truth for evaluation

# Inject synthetic positive labels for testing: only the 'insider' column is
# modified, the feature values in X are left untouched.
df.loc[0, 'insider'] = 1  # Mark the first row as an anomaly
df.loc[5, 'insider'] = 1  # Mark the sixth row as an anomaly
y_true = df["insider"]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train a One-Class SVM (RBF kernel, nu=0.4)
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)  # Adjust nu and gamma
oc_svm.fit(X_scaled)

# Score the same rows the model was fitted on
decision_scores = oc_svm.decision_function(X_scaled)

# Check decision scores
print("Decision Scores:\n", decision_scores[:10])

# Rows whose score is below the fixed threshold are predicted as anomalies (1);
# raising the threshold flags more rows.
threshold = 0.2  # Lower this value to flag fewer rows
y_pred = [1 if score < threshold else 0 for score in decision_scores]

# Check Predictions
print("Predictions:\n", y_pred[:10])

# Calculate Metrics (zero_division=1: a metric with a zero denominator is reported as 1)
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Dataset Loaded:
           user     week  after_hours_logons  num_exe_files  \
0  HMS1658.csv  2010-01                   0              0   
1  HMS1658.csv  2010-02                   4              0   
2  HMS1658.csv  2010-03                   4              0   
3  HMS1658.csv  2010-04                   4              0   
4  HMS1658.csv  2010-05                   1              0   

   num_usb_insertions  num_other_pc  insider  
0                 0.0             2        0  
1                 0.0             2        0  
2                 0.0             2        0  
3                 0.0             2        0  
4                 0.0             0        0  
Decision Scores:
 [-1.03034908e-01  9.44578350e-02  9.44578350e-02  9.44578350e-02
 -4.47549744e-01 -6.66545136e-02  5.14413227e-05 -3.39440310e-04
 -3.39440310e-04  5.14413227e-05]
Predictions:
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Precision: 0.1250
Recall: 1.0000
F1-score: 0.2222


In [117]:
# Sweep a set of fixed decision-score thresholds.
# Relies on `decision_scores` and `y_true` left in scope by a previously run cell.
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]  # Try different fixed thresholds

# For each threshold, compute precision, recall and F1-score
for threshold in thresholds:
    y_pred = np.where(decision_scores < threshold, 1, 0)  # 1 (anomaly) when the score is below the threshold

    # zero_division=1: a metric with a zero denominator is reported as 1
    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -0.1000: Precision = 0.5000, Recall = 0.5000, F1 Score = 0.5000
Threshold = 0.0000: Precision = 0.4000, Recall = 1.0000, F1 Score = 0.5714
Threshold = 0.1000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = 0.2000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222


In [105]:
# Re-create the DataFrame with the provided data after system reset

import pandas as pd

# Hand-entered weekly feature table for user CGF1056 (16 weeks, all labelled non-insider)
data = {
    "user": ["CGF1056"] * 16,
    "week": [
        "2010-01", "2010-02", "2010-03", "2010-04", "2010-05", "2010-06", "2010-07", "2010-08", "2010-09", "2010-10",
        "2010-11", "2010-12", "2010-13", "2010-14", "2010-15", "2010-16"
    ],
    "after_hours_logons": [0, 4, 4, 4, 1, 0, 5, 2, 2, 5, 2, 3, 4, 1, 4, 4],
    "num_exe_files": [0] * 16,
    "num_usb_insertions": [0.0] * 16,
    "num_other_pc": [2, 2, 2, 2, 0, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1],
    "insider": [0] * 16
}

# Convert the dictionary to a DataFrame
df_new = pd.DataFrame(data)

# Save the DataFrame to a CSV file (without the index column)
csv_file_path = '/r5.2-4-CGF1056.csv'
df_new.to_csv(csv_file_path, index=False)

# Last expression of the cell: the notebook displays the written path
csv_file_path


'/r5.2-4-CGF1056.csv'

In [106]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """
    Collects the USB "Connect" events of one user from `device.csv`.

    :param user_id: User ID compared against column index 2 of each row.
    :param dataset_path: Directory that contains `device.csv`.
    :return: List of raw CSV rows (lists of strings) for the matching events.
    """
    usb_data = []
    # The file name must be relative: os.path.join discards `dataset_path`
    # when a later component is absolute (starts with "/").
    with open(os.path.join(dataset_path, "device.csv"), "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (tolerates an empty file)
        for row in reader:
            # Column 2 is the user, column 5 the activity; short rows are ignored.
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect":
                usb_data.append(row)
    return usb_data

def get_num_usb_insertions_per_week(user, usb_data):
    """
    Counts USB insertions per week for one user.

    Every week in which an event was observed is reported, and weeks between
    the first and last observed week that had no events are added with 0.

    :param user: User ID written into the "user" column of the result.
    :param usb_data: Rows whose index 1 holds a "%m/%d/%Y %H:%M:%S" timestamp.
    :return: DataFrame with ['user', 'week', 'num_usb_insertions'], sorted by
        week ("%Y-%W"); empty when `usb_data` is empty.
    """
    weekly_usb_counts = defaultdict(int)
    for row in usb_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_usb_counts[file_time.strftime("%Y-%W")] += 1

    # Start from the observed weeks so none of them can be lost while filling.
    weekly_counts = dict(weekly_usb_counts)
    if weekly_counts:
        min_week = min(weekly_counts)
        max_week = max(weekly_counts)
        current_date = datetime.strptime(min_week + "-1", "%Y-%W-%w")
        end_date = datetime.strptime(max_week + "-1", "%Y-%W-%w")

        while current_date <= end_date:
            week_str = current_date.strftime("%Y-%W")
            # The Monday of a "-00" week belongs to the previous year, so its
            # label can fall before min_week; keep only labels inside the range.
            if min_week <= week_str <= max_week:
                weekly_counts.setdefault(week_str, 0)
            current_date += timedelta(days=7)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_usb_insertions"])


def get_user_exe_data(user_id, dataset_path):
    """
    Collects the `.exe` file events of one user from `file.csv`.

    :param user_id: User ID compared against column index 2 of each row.
    :param dataset_path: Directory that contains `file.csv`.
    :return: List of raw CSV rows (lists of strings) whose column index 4
        ends with ".exe".
    """
    exe_data = []
    # The file name must be relative: os.path.join discards `dataset_path`
    # when a later component is absolute (starts with "/").
    with open(os.path.join(dataset_path, "file.csv"), "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header (tolerates an empty file)
        for row in reader:
            # Column 2 is the user, column 4 the file name; short rows are ignored.
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe"):
                exe_data.append(row)
    return exe_data

def get_num_exe_per_week(user, exe_data):
    """
    Counts `.exe` file events per week for one user.

    Every week in which an event was observed is reported, and weeks between
    the first and last observed week that had no events are added with 0.

    :param user: User ID written into the "user" column of the result.
    :param exe_data: Rows whose index 1 holds a "%m/%d/%Y %H:%M:%S" timestamp.
    :return: DataFrame with ['user', 'week', 'num_exe_files'], sorted by week
        ("%Y-%W"); empty when `exe_data` is empty.
    """
    weekly_exe_counts = defaultdict(int)
    for row in exe_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_exe_counts[file_time.strftime("%Y-%W")] += 1

    # Start from the observed weeks so none of them can be lost while filling.
    weekly_counts = dict(weekly_exe_counts)
    if weekly_counts:
        min_week = min(weekly_counts)
        max_week = max(weekly_counts)
        current_date = datetime.strptime(min_week + "-1", "%Y-%W-%w")
        end_date = datetime.strptime(max_week + "-1", "%Y-%W-%w")

        while current_date <= end_date:
            week_str = current_date.strftime("%Y-%W")
            # The Monday of a "-00" week belongs to the previous year, so its
            # label can fall before min_week; keep only labels inside the range.
            if min_week <= week_str <= max_week:
                weekly_counts.setdefault(week_str, 0)
            current_date += timedelta(days=7)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_exe_files"])

# %%
def get_user_logon_data(user_id, dataset_path):
    """
    Collects all rows of `logon.csv` that belong to one user.

    The header is not skipped explicitly; it is simply filtered out like any
    other row whose column index 2 differs from `user_id`.

    :param user_id: User ID compared against column index 2 of each row.
    :param dataset_path: Directory that contains `logon.csv`.
    :return: List of raw CSV rows (lists of strings) for the user.
    """
    logon_data = []
    # The file name must be relative: os.path.join discards `dataset_path`
    # when a later component is absolute (starts with "/").
    with open(os.path.join(dataset_path, "logon.csv"), "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            # Blank or truncated lines have no user column; ignore them.
            if len(row) > 2 and row[2] == user_id:
                logon_data.append(row)
    return logon_data


# %%
def get_user_pc(logon_data):
    """
    Returns the PC (column index 3) that occurs most often in `logon_data`.

    When several PCs share the highest count, the one encountered first wins.

    :param logon_data: Rows whose index 3 holds the PC identifier.
    :return: The most frequent PC identifier.
    :raises ValueError: If `logon_data` is empty.
    """
    occurrences = {}
    for record in logon_data:
        machine = record[3]
        if machine in occurrences:
            occurrences[machine] += 1
        else:
            occurrences[machine] = 1
    return max(occurrences, key=occurrences.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """
    Counts, per week, the distinct PCs other than `user_pc` that appear in
    the user's logon rows.

    Every week in which a row was observed is reported, and weeks between the
    first and last observed week that had no rows are added with 0.

    :param user: User ID written into the "user" column of the result.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Rows with a "%m/%d/%Y %H:%M:%S" timestamp at index 1
        and the PC identifier at index 3.
    :return: DataFrame with ['user', 'week', 'num_other_pc'], sorted by week
        ("%Y-%W"); empty when `logon_data` is empty.
    """
    weekly_pc_counts = defaultdict(set)  # Unique non-primary PCs per week
    all_weeks = set()  # Every week in which a row occurred

    for row in logon_data:
        logon_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        week = logon_time.strftime("%Y-%W")  # Year-Week format
        all_weeks.add(week)

        if row[3] != user_pc:
            weekly_pc_counts[week].add(row[3])

    # Start from the observed weeks so none of them can be lost while filling.
    weekly_counts = {week: len(weekly_pc_counts.get(week, ())) for week in all_weeks}

    if all_weeks:  # min()/max() would raise on an empty set
        min_week = min(all_weeks)
        max_week = max(all_weeks)
        current_date = datetime.strptime(min_week + "-1", "%Y-%W-%w")
        end_date = datetime.strptime(max_week + "-1", "%Y-%W-%w")

        while current_date <= end_date:
            week_str = current_date.strftime("%Y-%W")
            # The Monday of a "-00" week belongs to the previous year, so its
            # label can fall before min_week; keep only labels inside the range.
            if min_week <= week_str <= max_week:
                weekly_counts.setdefault(week_str, 0)
            current_date += timedelta(days=7)

    # Convert to DataFrame
    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_other_pc"])


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    Only rows whose activity is "logon" (case-insensitive) and whose user
    equals `user` are considered; rows with an unparsable timestamp are
    skipped. Every week with at least one such logon is reported, and weeks
    between the first and last of them that had none are added with 0.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours (inclusive).
    :param business_end: Datetime.time representing end of business hours (exclusive).
    :return: DataFrame with ['user', 'week', 'after_hours_logons'], sorted by week.
    :raises ValueError: If a row does not have exactly five columns.
    """
    after_hours_counts = defaultdict(int)
    all_weeks = set()  # Every week in which the user logged on

    for row in logon_data:
        _logon_id, timestamp, logon_user, _pc, activity = row  # Unpack columns

        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        logon_week = logon_time.strftime("%Y-%W")
        all_weeks.add(logon_week)

        # Compare only the time-of-day component against business hours.
        logon_hour = logon_time.time()
        if logon_hour < business_start or logon_hour >= business_end:
            after_hours_counts[logon_week] += 1

    # Start from the observed weeks so none of them can be lost while filling.
    weekly_counts = {week: after_hours_counts.get(week, 0) for week in all_weeks}

    if all_weeks:
        min_week = min(all_weeks)
        max_week = max(all_weeks)
        current_date = datetime.strptime(min_week + "-1", "%Y-%W-%w")
        end_date = datetime.strptime(max_week + "-1", "%Y-%W-%w")

        while current_date <= end_date:
            week_str = current_date.strftime("%Y-%W")
            # The Monday of a "-00" week belongs to the previous year, so its
            # label can fall before min_week; keep only labels inside the range.
            if min_week <= week_str <= max_week:
                weekly_counts.setdefault(week_str, 0)
            current_date += timedelta(days=7)

    # Convert to DataFrame
    result_data = [(user, week, weekly_counts[week]) for week in sorted(weekly_counts)]
    after_hours_df = pd.DataFrame(
        result_data, columns=["user", "week", "after_hours_logons"]
    )

    return after_hours_df


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches `insider_root` for the insider CSV file of the given user.

    A file matches when its base name starts with "r5.2-" and ends with
    "-{user}.csv" (e.g. "r5.2-4-CGF1056.csv").

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the first matching file found, else None.
    """
    suffix = f"-{user}.csv"
    for root, _, files in os.walk(insider_root):
        for file in files:
            # os.walk yields bare file names (no directory part), so the prefix
            # has to be compared without a leading path such as "/content/".
            if file.startswith("r5.2-") and file.endswith(suffix):
                return os.path.join(root, file)  # Full path of the match
    return None  # No file for this user


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels of the timestamps found in the
    3rd column of a CSV file.

    Rows with fewer than three columns, or whose 3rd column is not a
    "%m/%d/%Y %H:%M:%S" timestamp, are ignored. Any error while opening or
    reading the file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of `%Y-%W` strings.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    weeks_seen = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for record in csv.reader(handle):
                if len(record) >= 3:
                    try:
                        label = datetime.strptime(
                            record[2], timestamp_format
                        ).strftime("%Y-%W")
                    except ValueError:
                        pass  # 3rd column is not a valid timestamp
                    else:
                        weeks_seen.add(label)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_seen


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column (1/0) to `df`, in place, and returns `df`.

    A week is labelled 1 when it appears among the weeks extracted from the
    user's insider file; when no insider file is found every week is labelled 0.

    :param df: DataFrame with at least a 'week' column.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: The same DataFrame, now carrying an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Weeks present in the insider file are the positive labels.
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(
            lambda wk: 1 if wk in flagged_weeks else 0
        )
    else:
        # No insider file for this user: nothing is flagged.
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds the labelled weekly feature table for one user.

    Computes the four weekly features (after-hours logons, .exe files, USB
    insertions, other PCs used), outer-joins them on "week", fills missing
    values with 0, prepends a "user" column and adds the "insider" label.

    :param user: The user ID.
    :param dataset_path: Path passed to the `get_user_*_data` loaders.
    :param insider_root: Root folder searched for the user's insider file.
    :return: DataFrame with user, week, the four feature columns and 'insider'.
    """
    # Logon-derived features
    logon_data = get_user_logon_data(user, dataset_path)
    user_pc = get_user_pc(logon_data)
    num_other_pc = get_num_other_PC_per_week(user, user_pc, logon_data)
    after_hours_logons = get_after_hours_logons(logon_data, user)

    # File-derived feature
    exe_data = get_user_exe_data(user, dataset_path)
    num_exe_files = get_num_exe_per_week(user, exe_data)

    # Device-derived feature
    usb_data = get_user_usb_data(user, dataset_path)
    num_usb = get_num_usb_insertions_per_week(user, usb_data)

    # Keep only the join key and the feature column of each table; the first
    # table seeds the merge and the rest are outer-joined onto it in order.
    merged_df = after_hours_logons[["week", "after_hours_logons"]]
    remaining_tables = (
        num_exe_files[["week", "num_exe_files"]],
        num_usb[["week", "num_usb_insertions"]],
        num_other_pc[["week", "num_other_pc"]],
    )
    for table in remaining_tables:
        merged_df = merged_df.merge(table, on="week", how="outer")

    # Weeks missing from a feature table show up as NaN after the outer join.
    merged_df.fillna(0, inplace=True)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage: load a per-user weekly feature table from a CSV file.
dataset_path = os.path.join("Insider threat dataset", "r5.2")
user = "CGF1056"
file_path = "/r5.2-4-CGF1056.csv"  # Change to your actual file path
df = pd.read_csv(file_path, on_bad_lines='skip')  # Malformed lines are skipped instead of raising

# NOTE: combine_user_feature_data is not called here; the table printed below comes from the CSV above.
print(df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   CGF1056  2010-01                   0              0                 0.0   
1   CGF1056  2010-02                   4              0                 0.0   
2   CGF1056  2010-03                   4              0                 0.0   
3   CGF1056  2010-04                   4              0                 0.0   
4   CGF1056  2010-05                   1              0                 0.0   
5   CGF1056  2010-06                   0              0                 0.0   
6   CGF1056  2010-07                   5              0                 0.0   
7   CGF1056  2010-08                   2              0                 0.0   
8   CGF1056  2010-09                   2              0                 0.0   
9   CGF1056  2010-10                   5              0                 0.0   
10  CGF1056  2010-11                   2              0                 0.0   
11  CGF1056  2010-12                   3            

In [146]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the per-user weekly feature table
file_path = "/r5.2-4-CGF1056.csv"  # Adjust to your file path if needed
df = pd.read_csv(file_path, on_bad_lines='skip')

# Check if the dataset is loaded correctly
print("Dataset Loaded:\n", df.head())

# Select the feature columns and the ground-truth label
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]  # Using df instead of final_df
y_true = df["insider"]  # Ground truth for evaluation

# Inject synthetic positive labels for testing: only the 'insider' column is
# modified, the feature values in X are left untouched.
df.loc[0, 'insider'] = 1  # Mark the first row as an anomaly
df.loc[5, 'insider'] = 1  # Mark the sixth row as an anomaly
y_true = df["insider"]
# NOTE(review): the next two assignments repeat the ones above and have no further effect.
df.loc[0, 'insider'] = 1  # Mark the first row as an anomaly
df.loc[5, 'insider'] = 1  # Mark the sixth row as an anomaly
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train a One-Class SVM (RBF kernel, gamma="auto", nu=0.05)
oc_svm = OneClassSVM(kernel="rbf", gamma="auto", nu=0.05)  # Adjust nu and gamma
oc_svm.fit(X_scaled)

# Score the same rows the model was fitted on
decision_scores = oc_svm.decision_function(X_scaled)

# Check decision scores
print("Decision Scores:\n", decision_scores[:10])

# Rows whose score is below the fixed threshold are predicted as anomalies (1);
# raising the threshold flags more rows.
threshold = 0.1  # Lower this value to flag fewer rows
y_pred = [1 if score < threshold else 0 for score in decision_scores]

# Check Predictions
print("Predictions:\n", y_pred[:10])

# Calculate Metrics (zero_division=1: a metric with a zero denominator is reported as 1)
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Dataset Loaded:
       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0  CGF1056  2010-01                   0              0                 0.0   
1  CGF1056  2010-02                   4              0                 0.0   
2  CGF1056  2010-03                   4              0                 0.0   
3  CGF1056  2010-04                   4              0                 0.0   
4  CGF1056  2010-05                   1              0                 0.0   

   num_other_pc  insider  
0             2        0  
1             2        0  
2             2        0  
3             2        0  
4             0        0  
Decision Scores:
 [-7.65529258e-05  1.93726596e-02  1.93726596e-02  1.93726596e-02
  1.15565618e-04  1.15565532e-04 -1.00646649e-04  3.93789874e-02
  3.93789874e-02 -1.00646649e-04]
Predictions:
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Precision: 0.1250
Recall: 1.0000
F1-score: 0.2222


In [147]:
# Sweep a set of fixed decision-score thresholds.
# Relies on `decision_scores` and `y_true` left in scope by a previously run cell.
# NOTE(review): if no score falls below a threshold nothing is predicted positive,
# and zero_division=1 then reports precision as 1 - check the thresholds against the score range.
thresholds = [-3.0, -2.0, -1.5, -1.0, -0.5]  # Try different fixed thresholds

# For each threshold, compute precision, recall and F1-score
for threshold in thresholds:
    y_pred = np.where(decision_scores < threshold, 1, 0)  # 1 (anomaly) when the score is below the threshold

    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -3.0000: Precision = 1.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -2.0000: Precision = 1.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -1.5000: Precision = 1.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -1.0000: Precision = 1.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -0.5000: Precision = 1.0000, Recall = 0.0000, F1 Score = 0.0000


In [122]:
import pandas as pd
from collections import defaultdict
from datetime import datetime, timedelta
import os

# Define the dataset paths
device_file_path = "/device.csv"
logon_file_path = "/logon.csv"
files_file_path = "/file.csv"

user_id = "ATO0307"  # The user you're interested in

# 1. Load Data
device_data = pd.read_csv(device_file_path)
logon_data = pd.read_csv(logon_file_path)
files_data = pd.read_csv(files_file_path)

# 2. Filter Data for the User
# Filter on `user_id` defined above, not on a `user` variable that may be
# left over from an earlier cell and refer to a different person.
device_data_user = device_data[device_data['user'] == user_id]
logon_data_user = logon_data[logon_data['user'] == user_id]
files_data_user = files_data[files_data['user'] == user_id]

# 3. Process USB Insertions (from device.csv)
def get_num_usb_insertions_per_week(user_data):
    """
    Counts USB insertions ("Connect" events) per week in a device-log DataFrame.

    :param user_data: DataFrame with a `date` column holding
        "%m/%d/%Y %H:%M:%S" strings. When an `activity` column is present,
        only rows equal to "Connect" are counted; without it every row counts.
    :return: defaultdict(int) mapping "%Y-%W" week labels to event counts.
    """
    usb_counts = defaultdict(int)
    for row in user_data.itertuples(index=False):
        # A Disconnect is not an insertion; rows without an activity field
        # are kept so frames lacking that column behave as before.
        if getattr(row, "activity", "Connect") != "Connect":
            continue
        # The timestamp lives in the `date` column, as in the sibling helpers.
        timestamp = datetime.strptime(row.date, "%m/%d/%Y %H:%M:%S")
        usb_counts[timestamp.strftime("%Y-%W")] += 1
    return usb_counts

# Weekly USB event counts computed from the selected user's device rows.
usb_insertions = get_num_usb_insertions_per_week(device_data_user)
def get_after_hours_logons_per_week(user_data, business_start="09:00", business_end="17:00"):
    """
    Counts logons outside business hours per week in a logon-log DataFrame.

    :param user_data: DataFrame with a `date` column holding
        "%m/%d/%Y %H:%M:%S" strings. When an `activity` column is present,
        only rows equal to "logon" (case-insensitive) are counted; without it
        every row counts.
    :param business_start: Start of business hours as "%H:%M" (inclusive).
    :param business_end: End of business hours as "%H:%M" (exclusive).
    :return: defaultdict(int) mapping "%Y-%W" week labels to after-hours counts.
    """
    logon_counts = defaultdict(int)
    business_start_time = datetime.strptime(business_start, "%H:%M").time()
    business_end_time = datetime.strptime(business_end, "%H:%M").time()

    for row in user_data.itertuples(index=False):
        # Logoff rows must not be counted as logons; rows without an activity
        # field are kept so frames lacking that column behave as before.
        activity = getattr(row, "activity", None)
        if activity is not None and str(activity).lower() != "logon":
            continue

        timestamp = datetime.strptime(row.date, "%m/%d/%Y %H:%M:%S")
        logon_time = timestamp.time()

        # Check if the logon occurred outside business hours
        if logon_time < business_start_time or logon_time >= business_end_time:
            logon_counts[timestamp.strftime("%Y-%W")] += 1
    return logon_counts

def get_num_usb_insertions_per_week(user_data):
    """
    Counts USB insertions ("Connect" events) per week in a device-log DataFrame.

    :param user_data: DataFrame with a `date` column holding
        "%m/%d/%Y %H:%M:%S" strings. When an `activity` column is present,
        only rows equal to "Connect" are counted; without it every row counts.
    :return: defaultdict(int) mapping "%Y-%W" week labels to event counts.
    """
    usb_counts = defaultdict(int)
    for row in user_data.itertuples(index=False):
        # A Disconnect is not an insertion; rows without an activity field
        # are kept so frames lacking that column behave as before.
        if getattr(row, "activity", "Connect") != "Connect":
            continue
        timestamp = datetime.strptime(row.date, "%m/%d/%Y %H:%M:%S")
        usb_counts[timestamp.strftime("%Y-%W")] += 1
    return usb_counts

def get_num_exe_files_per_week(user_data):
    """Count, per calendar week, the rows whose file name ends in ".exe".

    :param user_data: DataFrame whose rows carry a ``date`` string in
        "%m/%d/%Y %H:%M:%S" format and a ``file_name`` string.
    :return: defaultdict mapping "%Y-%W" week labels to the number of
        ``.exe`` rows in that week.
    """
    exe_tally = defaultdict(int)
    for entry in user_data.itertuples(index=False):
        # The timestamp is parsed for every row, so a malformed date raises
        # even when the row is not an executable.
        week_label = datetime.strptime(entry.date, "%m/%d/%Y %H:%M:%S").strftime("%Y-%W")
        if not entry.file_name.endswith(".exe"):
            continue
        exe_tally[week_label] += 1
    return exe_tally


In [123]:
# Re-create the DataFrame with the provided data after system reset

import pandas as pd

# Hand-entered weekly feature table for a single user (16 consecutive weeks).
data = dict(
    user=["ATO0307"] * 16,
    week=[f"2010-{week_no:02d}" for week_no in range(1, 17)],
    after_hours_logons=[0, 4, 4, 4, 1, 0, 5, 2, 2, 5, 2, 3, 4, 1, 4, 4],
    num_exe_files=[0] * 16,
    num_usb_insertions=[0.0] * 16,
    num_other_pc=[2, 2, 2, 2, 0, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1],
    insider=[0] * 16,
)

# Build the frame and persist it without the index column.
csv_file_path = '/ATO0307.csv'
df_new = pd.DataFrame(data)
df_new.to_csv(csv_file_path, index=False)

# Bare expression so the notebook echoes the output path.
csv_file_path


'/ATO0307.csv'

In [124]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """Collect the "Connect" rows of ``device.csv`` that belong to one user.

    :param user_id: User identifier compared against the third CSV column.
    :param dataset_path: Directory that contains ``device.csv``.
    :return: List of raw CSV rows (lists of strings) whose third column equals
        ``user_id`` and whose sixth column equals "Connect". Rows with fewer
        than six columns are ignored.
    :raises OSError: if ``device.csv`` cannot be opened.
    """
    # The file name has to be relative: os.path.join throws away every earlier
    # component once it meets an absolute one, so "/device.csv" silently
    # ignored dataset_path. Pass dataset_path="/" to read from the root.
    device_csv = os.path.join(dataset_path, "device.csv")
    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; an empty file yields no rows
        return [
            row
            for row in reader
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """Count USB rows per week, including the empty weeks in between.

    :param user: User identifier written into the ``user`` column.
    :param usb_data: Iterable of device rows; index 1 holds the timestamp as
        "%m/%d/%Y %H:%M:%S".
    :return: DataFrame with columns ['user', 'week', 'num_usb_insertions'],
        one row per "%Y-%W" week label between the first and last event,
        sorted by week. Empty when ``usb_data`` is empty.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    weekly_usb_counts = defaultdict(int)
    first_day = last_day = None
    for row in usb_data:
        event_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_usb_counts[event_time.strftime("%Y-%W")] += 1
        event_day = event_time.date()
        if first_day is None or event_day < first_day:
            first_day = event_day
        if last_day is None or event_day > last_day:
            last_day = event_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every event counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_usb_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_usb_insertions"])


def get_user_exe_data(user_id, dataset_path):
    """Collect the ``file.csv`` rows for one user whose file name ends in ".exe".

    :param user_id: User identifier compared against the third CSV column.
    :param dataset_path: Directory that contains ``file.csv``.
    :return: List of raw CSV rows (lists of strings) whose third column equals
        ``user_id`` and whose fifth column ends with ".exe". Rows with fewer
        than five columns are ignored.
    :raises OSError: if ``file.csv`` cannot be opened.
    """
    # The file name has to be relative: os.path.join throws away every earlier
    # component once it meets an absolute one, so "/file.csv" silently
    # ignored dataset_path. Pass dataset_path="/" to read from the root.
    file_csv = os.path.join(dataset_path, "file.csv")
    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; an empty file yields no rows
        return [
            row
            for row in reader
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """Count executable-file rows per week, including the empty weeks in between.

    :param user: User identifier written into the ``user`` column.
    :param exe_data: Iterable of file rows; index 1 holds the timestamp as
        "%m/%d/%Y %H:%M:%S".
    :return: DataFrame with columns ['user', 'week', 'num_exe_files'], one row
        per "%Y-%W" week label between the first and last event, sorted by
        week. Empty when ``exe_data`` is empty.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    weekly_exe_counts = defaultdict(int)
    first_day = last_day = None
    for row in exe_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_exe_counts[file_time.strftime("%Y-%W")] += 1
        file_day = file_time.date()
        if first_day is None or file_day < first_day:
            first_day = file_day
        if last_day is None or file_day > last_day:
            last_day = file_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every event counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_exe_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_exe_files"])

# %%
def get_user_logon_data(user_id, dataset_path):
    """Collect every ``logon.csv`` row that belongs to one user.

    The header is not skipped explicitly; it is left out only because its
    third column does not equal ``user_id``.

    :param user_id: User identifier compared against the third CSV column.
    :param dataset_path: Directory that contains ``logon.csv``.
    :return: List of raw CSV rows (lists of strings) whose third column equals
        ``user_id``. Rows with fewer than three columns are ignored.
    :raises OSError: if ``logon.csv`` cannot be opened.
    """
    # The file name has to be relative: os.path.join throws away every earlier
    # component once it meets an absolute one, so "/logon.csv" silently
    # ignored dataset_path. Pass dataset_path="/" to read from the root.
    logon_csv = os.path.join(dataset_path, "logon.csv")
    with open(logon_csv, "r", newline="") as csvfile:
        return [
            row
            for row in csv.reader(csvfile)
            if len(row) > 2 and row[2] == user_id
        ]


# %%
def get_user_pc(logon_data):
    """Return the PC that occurs most often in the given logon rows.

    :param logon_data: Iterable of rows whose fourth element (index 3) is the
        PC identifier.
    :return: The most frequent PC identifier.
    :raises ValueError: if ``logon_data`` is empty.
    """
    visits = defaultdict(int)
    for record in logon_data:
        visits[record[3]] += 1
    # max() scans keys in first-seen order, so a tie goes to the PC seen first.
    return max(visits, key=visits.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """Count, per week, the distinct PCs used other than the user's main PC.

    :param user: User identifier written into the ``user`` column.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Iterable of logon rows; index 1 holds the timestamp as
        "%m/%d/%Y %H:%M:%S" and index 3 the PC identifier.
    :return: DataFrame with columns ['user', 'week', 'num_other_pc'], one row
        per "%Y-%W" week label between the first and last row, sorted by
        week. Empty when ``logon_data`` is empty.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    weekly_pc_counts = defaultdict(set)  # week label -> distinct non-primary PCs
    first_day = last_day = None

    for row in logon_data:
        logon_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        logon_day = logon_time.date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        if row[3] != user_pc:
            weekly_pc_counts[logon_time.strftime("%Y-%W")].add(row[3])

    # With no rows there is no span to fill; return an empty frame, as the
    # other per-week helpers do, instead of failing on min() of an empty set.
    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every logon counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = len(weekly_pc_counts.get(week, ()))
            current_day += timedelta(days=1)

    # Convert to DataFrame
    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_other_pc"])


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    Only rows whose activity is "logon" (case-insensitive) and whose user
    matches are considered. Rows with fewer than five columns or an
    unparseable timestamp are skipped.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours (inclusive).
    :param business_end: Datetime.time representing end of business hours (exclusive).
    :return: DataFrame with ['user', 'week', 'after_hours_logons'], one row per
        "%Y-%W" week label between the user's first and last logon, sorted by
        week. Empty when the user has no logons.
    """
    after_hours_counts = defaultdict(int)
    first_day = last_day = None

    for row in logon_data:
        if len(row) < 5:
            continue  # Not a full [id, date, user, pc, activity] record
        timestamp, logon_user, activity = row[1], row[2], row[4]

        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        logon_day = logon_time.date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        # Business hours form the half-open window [business_start, business_end).
        logon_hour = logon_time.time()
        if logon_hour < business_start or logon_hour >= business_end:
            after_hours_counts[logon_time.strftime("%Y-%W")] += 1

    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every logon counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = after_hours_counts.get(week, 0)
            current_day += timedelta(days=1)

    # Convert to DataFrame
    result_data = [(user, week, count) for week, count in sorted(weekly_counts.items())]
    after_hours_df = pd.DataFrame(
        result_data, columns=["user", "week", "after_hours_logons"]
    )

    return after_hours_df


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches for the insider CSV file for the given user in the `insider_root` directory.

    A file matches when its name starts with "r5.2-" and ends with
    "-<user>.csv".

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    wanted_suffix = f"-{user}.csv"
    for root, _, files in os.walk(insider_root):
        for file in files:
            # os.walk yields bare file names, never paths, so the prefix test
            # must not include a directory such as "/content/"; with it the
            # search could never match and every user looked clean.
            if file.startswith("r5.2-") and file.endswith(wanted_suffix):
                return os.path.join(root, file)  # Return full file path if found
    return None  # Return None if no file is found


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels found in the third column of a CSV file.

    Rows with fewer than three columns, or whose third column is not a
    "%m/%d/%Y %H:%M:%S" timestamp, are ignored. Any error while reading the
    file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of detected `Year-Week` values.
    """
    weeks_found = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for fields in csv.reader(handle):
                if len(fields) >= 3:
                    try:
                        stamp = datetime.strptime(fields[2], "%m/%d/%Y %H:%M:%S")
                        weeks_found.add(stamp.strftime("%Y-%W"))
                    except ValueError:
                        pass  # Third column is not a usable timestamp
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_found


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column (1 or 0) marking the weeks listed in the user's insider file.

    The frame is modified in place and also returned.

    :param df: DataFrame with a 'week' column of `Year-Week` labels.
    :param user: The user ID for whom the dataframe is filtered.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: DataFrame with an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        flagged_weeks = extract_weeks_from_csv(answers_path)

        def is_flagged(week_label):
            return 1 if week_label in flagged_weeks else 0

        df["insider"] = df["week"].apply(is_flagged)
    else:
        # No insider file for this user: every week is labelled 0.
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds the labelled weekly feature table for one user.

    Computes the four weekly features (after-hours logons, .exe files, USB
    insertions, other PCs used), outer-joins them on 'week', fills gaps with
    0, prepends a 'user' column and adds the 'insider' label.

    :param user: The user ID to build features for.
    :param dataset_path: Directory passed on to the CSV-reading helpers.
    :param insider_root: Directory passed on to `label_insider_weeks`.
    :return: DataFrame returned by `label_insider_weeks`.
    """
    # Logon-derived features
    logon_rows = get_user_logon_data(user, dataset_path)
    primary_pc = get_user_pc(logon_rows)
    other_pc_weekly = get_num_other_PC_per_week(user, primary_pc, logon_rows)
    after_hours_weekly = get_after_hours_logons(logon_rows, user)

    # File- and device-derived features
    exe_weekly = get_num_exe_per_week(user, get_user_exe_data(user, dataset_path))
    usb_weekly = get_num_usb_insertions_per_week(user, get_user_usb_data(user, dataset_path))

    # Keep only the join key and the feature column of each frame.
    feature_frames = [
        after_hours_weekly[["week", "after_hours_logons"]],
        exe_weekly[["week", "num_exe_files"]],
        usb_weekly[["week", "num_usb_insertions"]],
        other_pc_weekly[["week", "num_other_pc"]],
    ]

    # Chain outer joins on "week" in the listed order.
    merged_df = feature_frames[0]
    for frame in feature_frames[1:]:
        merged_df = merged_df.merge(frame, on="week", how="outer")

    # Weeks missing from a feature frame come back as NaN; treat them as 0.
    merged_df = merged_df.fillna(0)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage
dataset_path = os.path.join("Insider threat dataset", "r5.2")
user = "ATO0307"
file_path = "/ATO0307.csv"  # Change to your actual file path
df = pd.read_csv(file_path, on_bad_lines='skip')  # This will skip lines with errors

# final_df = combine_user_feature_data(user, data
print(df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   ATO0307  2010-01                   0              0                 0.0   
1   ATO0307  2010-02                   4              0                 0.0   
2   ATO0307  2010-03                   4              0                 0.0   
3   ATO0307  2010-04                   4              0                 0.0   
4   ATO0307  2010-05                   1              0                 0.0   
5   ATO0307  2010-06                   0              0                 0.0   
6   ATO0307  2010-07                   5              0                 0.0   
7   ATO0307  2010-08                   2              0                 0.0   
8   ATO0307  2010-09                   2              0                 0.0   
9   ATO0307  2010-10                   5              0                 0.0   
10  ATO0307  2010-11                   2              0                 0.0   
11  ATO0307  2010-12                   3            

In [127]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the weekly feature table back from disk, skipping malformed lines.
file_path = "/ATO0307.csv"  # Adjust to your file path if needed
df = pd.read_csv(file_path, on_bad_lines='skip')
print("Dataset Loaded:\n", df.head())

# Feature matrix: the four weekly behaviour counts.
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]

# Testing aid: mark rows 0 and 5 as insiders so the ground truth contains
# positives, then read the ground-truth labels.
df.loc[[0, 5], 'insider'] = 1
y_true = df["insider"]

# Standardise the features before fitting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the One-Class SVM on all rows and score the same rows.
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)
oc_svm.fit(X_scaled)
decision_scores = oc_svm.decision_function(X_scaled)
print("Decision Scores:\n", decision_scores[:10])

# A row is predicted anomalous (1) when its score is below the threshold.
threshold = 0
y_pred = [int(score < threshold) for score in decision_scores]
print("Predictions:\n", y_pred[:10])

# Precision, recall and F1 for the anomaly class.
precision, recall, f1 = (
    metric(y_true, y_pred, pos_label=1, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Dataset Loaded:
       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0  ATO0307  2010-01                   0              0                 0.0   
1  ATO0307  2010-02                   4              0                 0.0   
2  ATO0307  2010-03                   4              0                 0.0   
3  ATO0307  2010-04                   4              0                 0.0   
4  ATO0307  2010-05                   1              0                 0.0   

   num_other_pc  insider  
0             2        0  
1             2        0  
2             2        0  
3             2        0  
4             0        0  
Decision Scores:
 [-1.03034908e-01  9.44578350e-02  9.44578350e-02  9.44578350e-02
 -4.47549744e-01 -6.66545136e-02  5.14413227e-05 -3.39440310e-04
 -3.39440310e-04  5.14413227e-05]
Predictions:
 [1, 0, 0, 0, 1, 1, 0, 1, 1, 0]
Precision: 0.4000
Recall: 1.0000
F1-score: 0.5714


In [128]:
# Candidate cut-offs on the decision score; a score below the cut-off counts
# as an anomaly (label 1).
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]

# Report precision, recall and F1 for each cut-off.
for threshold in thresholds:
    y_pred = (decision_scores < threshold).astype(int)

    precision, recall, f1 = (
        metric(y_true, y_pred, pos_label=1, zero_division=1)
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -0.1000: Precision = 0.5000, Recall = 0.5000, F1 Score = 0.5000
Threshold = 0.0000: Precision = 0.4000, Recall = 1.0000, F1 Score = 0.5714
Threshold = 0.1000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = 0.2000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222


In [149]:
import pandas as pd
from collections import defaultdict
from datetime import datetime, timedelta
import os

# Define the dataset paths
device_file_path = "/device.csv"
logon_file_path = "/logon.csv"
files_file_path = "/file.csv"

user_id = "DMP0344"  # The user you're interested in

# 1. Load Data
device_data = pd.read_csv(device_file_path)
logon_data = pd.read_csv(logon_file_path)
files_data = pd.read_csv(files_file_path)

# 2. Filter Data for the User
# Filter on user_id, the user chosen in this cell. The filters used to compare
# against `user`, a global left over from an earlier cell, so they selected a
# different user (or raised NameError on a fresh kernel).
device_data_user = device_data[device_data['user'] == user_id]
logon_data_user = logon_data[logon_data['user'] == user_id]
files_data_user = files_data[files_data['user'] == user_id]

# 3. Process USB Insertions (from device.csv)
def get_num_usb_insertions_per_week(user_data):
    """Count device events per calendar week for one user's rows.

    Every row is counted, whatever its activity value.

    :param user_data: DataFrame of device events already filtered to a single
        user; each row must carry a ``date`` column holding a string in
        "%m/%d/%Y %H:%M:%S" format.
    :return: defaultdict mapping "%Y-%W" week labels to the number of rows
        whose timestamp falls in that week.
    :raises ValueError: if a ``date`` value does not match the expected format.
    """
    usb_counts = defaultdict(int)
    for row in user_data.itertuples(index=False):
        # Read the event time from `date`, the column every other per-week
        # helper in this notebook uses; `row.timestamp` raised AttributeError
        # on any non-empty frame.
        timestamp = datetime.strptime(row.date, "%m/%d/%Y %H:%M:%S")
        week = timestamp.strftime("%Y-%W")
        usb_counts[week] += 1
    return usb_counts

# Per-week device-event counts for the rows kept in device_data_user.
usb_insertions = get_num_usb_insertions_per_week(device_data_user)
def get_after_hours_logons_per_week(user_data, business_start="09:00", business_end="17:00"):
    """Count, per week, the rows whose time of day is outside business hours.

    Every row of ``user_data`` is considered; no activity filter is applied.

    :param user_data: DataFrame whose rows carry a ``date`` string in
        "%m/%d/%Y %H:%M:%S" format.
    :param business_start: Start of business hours as "HH:MM" (inclusive).
    :param business_end: End of business hours as "HH:MM" (exclusive).
    :return: defaultdict mapping "%Y-%W" week labels to the number of rows
        timed before ``business_start`` or at/after ``business_end``.
    """
    opens_at = datetime.strptime(business_start, "%H:%M").time()
    closes_at = datetime.strptime(business_end, "%H:%M").time()

    tally = defaultdict(int)
    for record in user_data.itertuples(index=False):
        moment = datetime.strptime(record.date, "%m/%d/%Y %H:%M:%S")
        # Inside the half-open window [opens_at, closes_at): not after hours.
        if opens_at <= moment.time() < closes_at:
            continue
        tally[moment.strftime("%Y-%W")] += 1
    return tally

def get_num_usb_insertions_per_week(user_data):
    """Count the rows of ``user_data`` per calendar week.

    :param user_data: DataFrame whose rows carry a ``date`` string in
        "%m/%d/%Y %H:%M:%S" format.
    :return: defaultdict mapping "%Y-%W" week labels to row counts.
    """
    per_week = defaultdict(int)
    week_labels = (
        datetime.strptime(event.date, "%m/%d/%Y %H:%M:%S").strftime("%Y-%W")
        for event in user_data.itertuples(index=False)
    )
    for label in week_labels:
        per_week[label] += 1
    return per_week

def get_num_exe_files_per_week(user_data):
    """Count, per calendar week, the rows whose file name ends in ".exe".

    :param user_data: DataFrame whose rows carry a ``date`` string in
        "%m/%d/%Y %H:%M:%S" format and a ``file_name`` string.
    :return: defaultdict mapping "%Y-%W" week labels to the number of
        ``.exe`` rows in that week.
    """
    exe_tally = defaultdict(int)
    for entry in user_data.itertuples(index=False):
        # The timestamp is parsed for every row, so a malformed date raises
        # even when the row is not an executable.
        week_label = datetime.strptime(entry.date, "%m/%d/%Y %H:%M:%S").strftime("%Y-%W")
        if not entry.file_name.endswith(".exe"):
            continue
        exe_tally[week_label] += 1
    return exe_tally

In [150]:
# Re-create the DataFrame with the provided data after system reset

import pandas as pd

# Hand-entered weekly feature table for a single user (16 consecutive weeks).
data = dict(
    user=["DMP0344"] * 16,
    week=[f"2010-{week_no:02d}" for week_no in range(1, 17)],
    after_hours_logons=[0, 4, 4, 4, 1, 0, 5, 2, 2, 5, 2, 3, 4, 1, 4, 4],
    num_exe_files=[0] * 16,
    num_usb_insertions=[0.0] * 16,
    num_other_pc=[2, 2, 2, 2, 0, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1],
    insider=[0] * 16,
)

# Build the frame and persist it without the index column.
csv_file_path = '/DMP0344.csv'
df_new = pd.DataFrame(data)
df_new.to_csv(csv_file_path, index=False)

# Bare expression so the notebook echoes the output path.
csv_file_path

'/DMP0344.csv'

In [151]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """Collect the "Connect" rows of ``device.csv`` that belong to one user.

    :param user_id: User identifier compared against the third CSV column.
    :param dataset_path: Directory that contains ``device.csv``.
    :return: List of raw CSV rows (lists of strings) whose third column equals
        ``user_id`` and whose sixth column equals "Connect". Rows with fewer
        than six columns are ignored.
    :raises OSError: if ``device.csv`` cannot be opened.
    """
    # The file name has to be relative: os.path.join throws away every earlier
    # component once it meets an absolute one, so "/device.csv" silently
    # ignored dataset_path. Pass dataset_path="/" to read from the root.
    device_csv = os.path.join(dataset_path, "device.csv")
    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; an empty file yields no rows
        return [
            row
            for row in reader
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """Count USB rows per week, including the empty weeks in between.

    :param user: User identifier written into the ``user`` column.
    :param usb_data: Iterable of device rows; index 1 holds the timestamp as
        "%m/%d/%Y %H:%M:%S".
    :return: DataFrame with columns ['user', 'week', 'num_usb_insertions'],
        one row per "%Y-%W" week label between the first and last event,
        sorted by week. Empty when ``usb_data`` is empty.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    weekly_usb_counts = defaultdict(int)
    first_day = last_day = None
    for row in usb_data:
        event_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_usb_counts[event_time.strftime("%Y-%W")] += 1
        event_day = event_time.date()
        if first_day is None or event_day < first_day:
            first_day = event_day
        if last_day is None or event_day > last_day:
            last_day = event_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every event counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_usb_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_usb_insertions"])


def get_user_exe_data(user_id, dataset_path):
    """Collect the ``file.csv`` rows for one user whose file name ends in ".exe".

    :param user_id: User identifier compared against the third CSV column.
    :param dataset_path: Directory that contains ``file.csv``.
    :return: List of raw CSV rows (lists of strings) whose third column equals
        ``user_id`` and whose fifth column ends with ".exe". Rows with fewer
        than five columns are ignored.
    :raises OSError: if ``file.csv`` cannot be opened.
    """
    # The file name has to be relative: os.path.join throws away every earlier
    # component once it meets an absolute one, so "/file.csv" silently
    # ignored dataset_path. Pass dataset_path="/" to read from the root.
    file_csv = os.path.join(dataset_path, "file.csv")
    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; an empty file yields no rows
        return [
            row
            for row in reader
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """Count executable-file rows per week, including the empty weeks in between.

    :param user: User identifier written into the ``user`` column.
    :param exe_data: Iterable of file rows; index 1 holds the timestamp as
        "%m/%d/%Y %H:%M:%S".
    :return: DataFrame with columns ['user', 'week', 'num_exe_files'], one row
        per "%Y-%W" week label between the first and last event, sorted by
        week. Empty when ``exe_data`` is empty.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    weekly_exe_counts = defaultdict(int)
    first_day = last_day = None
    for row in exe_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_exe_counts[file_time.strftime("%Y-%W")] += 1
        file_day = file_time.date()
        if first_day is None or file_day < first_day:
            first_day = file_day
        if last_day is None or file_day > last_day:
            last_day = file_day

    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every event counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = weekly_exe_counts.get(week, 0)
            current_day += timedelta(days=1)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_exe_files"])

# %%
def get_user_logon_data(user_id, dataset_path):
    """Collect every ``logon.csv`` row that belongs to one user.

    The header is not skipped explicitly; it is left out only because its
    third column does not equal ``user_id``.

    :param user_id: User identifier compared against the third CSV column.
    :param dataset_path: Directory that contains ``logon.csv``.
    :return: List of raw CSV rows (lists of strings) whose third column equals
        ``user_id``. Rows with fewer than three columns are ignored.
    :raises OSError: if ``logon.csv`` cannot be opened.
    """
    # The file name has to be relative: os.path.join throws away every earlier
    # component once it meets an absolute one, so "/logon.csv" silently
    # ignored dataset_path. Pass dataset_path="/" to read from the root.
    logon_csv = os.path.join(dataset_path, "logon.csv")
    with open(logon_csv, "r", newline="") as csvfile:
        return [
            row
            for row in csv.reader(csvfile)
            if len(row) > 2 and row[2] == user_id
        ]


# %%
def get_user_pc(logon_data):
    """Return the PC that occurs most often in the given logon rows.

    :param logon_data: Iterable of rows whose fourth element (index 3) is the
        PC identifier.
    :return: The most frequent PC identifier.
    :raises ValueError: if ``logon_data`` is empty.
    """
    visits = defaultdict(int)
    for record in logon_data:
        visits[record[3]] += 1
    # max() scans keys in first-seen order, so a tie goes to the PC seen first.
    return max(visits, key=visits.get)


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """Count, per week, the distinct PCs used other than the user's main PC.

    :param user: User identifier written into the ``user`` column.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Iterable of logon rows; index 1 holds the timestamp as
        "%m/%d/%Y %H:%M:%S" and index 3 the PC identifier.
    :return: DataFrame with columns ['user', 'week', 'num_other_pc'], one row
        per "%Y-%W" week label between the first and last row, sorted by
        week. Empty when ``logon_data`` is empty.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    weekly_pc_counts = defaultdict(set)  # week label -> distinct non-primary PCs
    first_day = last_day = None

    for row in logon_data:
        logon_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        logon_day = logon_time.date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        if row[3] != user_pc:
            weekly_pc_counts[logon_time.strftime("%Y-%W")].add(row[3])

    # With no rows there is no span to fill; return an empty frame, as the
    # other per-week helpers do, instead of failing on min() of an empty set.
    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every logon counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = len(weekly_pc_counts.get(week, ()))
            current_day += timedelta(days=1)

    # Convert to DataFrame
    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_other_pc"])


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    Only rows whose activity is "logon" (case-insensitive) and whose user
    matches are considered. Rows with fewer than five columns or an
    unparseable timestamp are skipped.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours (inclusive).
    :param business_end: Datetime.time representing end of business hours (exclusive).
    :return: DataFrame with ['user', 'week', 'after_hours_logons'], one row per
        "%Y-%W" week label between the user's first and last logon, sorted by
        week. Empty when the user has no logons.
    """
    after_hours_counts = defaultdict(int)
    first_day = last_day = None

    for row in logon_data:
        if len(row) < 5:
            continue  # Not a full [id, date, user, pc, activity] record
        timestamp, logon_user, activity = row[1], row[2], row[4]

        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        logon_day = logon_time.date()
        if first_day is None or logon_day < first_day:
            first_day = logon_day
        if last_day is None or logon_day > last_day:
            last_day = logon_day

        # Business hours form the half-open window [business_start, business_end).
        logon_hour = logon_time.time()
        if logon_hour < business_start or logon_hour >= business_end:
            after_hours_counts[logon_time.strftime("%Y-%W")] += 1

    weekly_counts = {}
    if first_day is not None:
        # Walk the span one day at a time. Stepping Monday to Monday never
        # lands in the partial "%W" week 00 at the start of a year, which
        # dropped every logon counted under that label.
        current_day = first_day
        while current_day <= last_day:
            week = current_day.strftime("%Y-%W")
            weekly_counts[week] = after_hours_counts.get(week, 0)
            current_day += timedelta(days=1)

    # Convert to DataFrame
    result_data = [(user, week, count) for week, count in sorted(weekly_counts.items())]
    after_hours_df = pd.DataFrame(
        result_data, columns=["user", "week", "after_hours_logons"]
    )

    return after_hours_df


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches for the insider CSV file for the given user in the `insider_root` directory.

    A file matches when its name starts with "r5.2-" and ends with
    "-<user>.csv".

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    wanted_suffix = f"-{user}.csv"
    for root, _, files in os.walk(insider_root):
        for file in files:
            # os.walk yields bare file names, never paths, so the prefix test
            # must not include a directory such as "/content/"; with it the
            # search could never match and every user looked clean.
            if file.startswith("r5.2-") and file.endswith(wanted_suffix):
                return os.path.join(root, file)  # Return full file path if found
    return None  # Return None if no file is found


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels found in the third column of a CSV file.

    Rows with fewer than three columns, or whose third column is not a
    "%m/%d/%Y %H:%M:%S" timestamp, are ignored. Any error while reading the
    file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of detected `Year-Week` values.
    """
    weeks_found = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for fields in csv.reader(handle):
                if len(fields) >= 3:
                    try:
                        stamp = datetime.strptime(fields[2], "%m/%d/%Y %H:%M:%S")
                        weeks_found.add(stamp.strftime("%Y-%W"))
                    except ValueError:
                        pass  # Third column is not a usable timestamp
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return weeks_found


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column (1/0) to `df`, flagging weeks listed in the user's insider file.

    The frame is modified in place and also returned. When no insider file is
    found for the user, every row gets 0.

    :param df: DataFrame with a 'week' column holding `Year-Week` labels.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: The same DataFrame with an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Weeks that appear in the insider file are the positive class
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(lambda label: int(label in flagged_weeks))
    else:
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds one weekly feature table for `user` and labels insider weeks.

    The four per-week feature frames (after-hours logons, .exe files, USB
    insertions, other PCs) are outer-joined on "week"; gaps become 0.

    :param user: The user ID to build features for.
    :param dataset_path: Folder passed to the raw-log readers.
    :param insider_root: Folder searched for the user's insider file.
    :return: DataFrame with 'user', 'week', the feature columns and 'insider'.
    """
    # Logon-derived features
    logon_rows = get_user_logon_data(user, dataset_path)
    primary_pc = get_user_pc(logon_rows)
    other_pc_weekly = get_num_other_PC_per_week(user, primary_pc, logon_rows)
    after_hours_weekly = get_after_hours_logons(logon_rows, user)

    # File- and device-derived features
    exe_weekly = get_num_exe_per_week(user, get_user_exe_data(user, dataset_path))
    usb_weekly = get_num_usb_insertions_per_week(
        user, get_user_usb_data(user, dataset_path)
    )

    # Keep only the join key and the feature column of each frame
    feature_frames = [
        after_hours_weekly[["week", "after_hours_logons"]],
        exe_weekly[["week", "num_exe_files"]],
        usb_weekly[["week", "num_usb_insertions"]],
        other_pc_weekly[["week", "num_other_pc"]],
    ]

    merged_df = feature_frames[0]
    for frame in feature_frames[1:]:
        merged_df = merged_df.merge(frame, on="week", how="outer")

    # Weeks missing from one source count as zero activity
    merged_df = merged_df.fillna(0)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage: load a pre-built weekly feature table for one user and print it.
# NOTE: dataset_path is assigned here but not used anywhere else in this cell.
dataset_path = os.path.join("Insider threat dataset", "r5.2")
user = "DMP0344"
file_path = "/DMP0344.csv"  # Change to your actual file path
df = pd.read_csv(file_path, on_bad_lines='skip')  # This will skip lines with errors

# The call that would build the table from the raw logs is disabled (and was left truncated):
# final_df = combine_user_feature_data(user, data
print(df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   DMP0344  2010-01                   0              0                 0.0   
1   DMP0344  2010-02                   4              0                 0.0   
2   DMP0344  2010-03                   4              0                 0.0   
3   DMP0344  2010-04                   4              0                 0.0   
4   DMP0344  2010-05                   1              0                 0.0   
5   DMP0344  2010-06                   0              0                 0.0   
6   DMP0344  2010-07                   5              0                 0.0   
7   DMP0344  2010-08                   2              0                 0.0   
8   DMP0344  2010-09                   2              0                 0.0   
9   DMP0344  2010-10                   5              0                 0.0   
10  DMP0344  2010-11                   2              0                 0.0   
11  DMP0344  2010-12                   3            

In [159]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset again if necessary
file_path = "/DMP0344.csv"  # Adjust to your file path if needed
df = pd.read_csv(file_path, on_bad_lines='skip')

# Check if the dataset is loaded correctly
print("Dataset Loaded:\n", df.head())

# Select Features
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]  # Using df instead of final_df
y_true = df["insider"]  # Ground truth for evaluation

# Introduce anomalies for testing (optional, if you don't already have anomalies)
# For testing purposes, you can modify a few values in the 'insider' column
# NOTE(review): these two labels are set by hand, so the metrics below are
# measured against synthetic ground truth, not against real insider weeks.
df.loc[0, 'insider'] = 1  # Set the first entry as an anomaly
df.loc[5, 'insider'] = 1  # Set the sixth entry as an anomaly
y_true = df["insider"]  # Re-read the column so y_true reflects the two edits above

# Normalize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train One-Class SVM with a Higher `nu`
# The model is fitted on every row of X_scaled; the labels are not passed to fit().
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)  # Adjust nu and gamma
oc_svm.fit(X_scaled)

# Predict Anomalies
# Scores are computed on the same rows the model was fitted on (no held-out data).
decision_scores = oc_svm.decision_function(X_scaled)

# Check decision scores
print("Decision Scores:\n", decision_scores[:10])

# Set a Fixed Threshold (Adjust the threshold to see different results)
threshold = 0 # You can change this value to something lower to capture anomalies
y_pred = [1 if score < threshold else 0 for score in decision_scores]  # 1 = score below threshold (flagged)

# Check Predictions
print("Predictions:\n", y_pred[:10])

# Calculate Metrics
# pos_label=1 scores the flagged class; zero_division=1 is the value reported
# when a metric's denominator is zero.
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Dataset Loaded:
       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0  DMP0344  2010-01                   0              0                 0.0   
1  DMP0344  2010-02                   4              0                 0.0   
2  DMP0344  2010-03                   4              0                 0.0   
3  DMP0344  2010-04                   4              0                 0.0   
4  DMP0344  2010-05                   1              0                 0.0   

   num_other_pc  insider  
0             2        0  
1             2        0  
2             2        0  
3             2        0  
4             0        0  
Decision Scores:
 [-1.03034908e-01  9.44578350e-02  9.44578350e-02  9.44578350e-02
 -4.47549744e-01 -6.66545136e-02  5.14413227e-05 -3.39440310e-04
 -3.39440310e-04  5.14413227e-05]
Predictions:
 [1, 0, 0, 0, 1, 1, 0, 1, 1, 0]
Precision: 0.4000
Recall: 1.0000
F1-score: 0.5714


In [160]:
# Define fixed threshold values for anomaly detection
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]  # Try different fixed thresholds

# Loop through each threshold and compute precision, recall, F1-score
# Relies on `decision_scores` and `y_true` already being defined by an earlier cell.
for threshold in thresholds:
    y_pred = np.where(decision_scores < threshold, 1, 0)  # Label as "1" if below threshold (anomaly)

    # zero_division=1 is the value reported when a metric's denominator is zero
    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -0.1000: Precision = 0.5000, Recall = 0.5000, F1 Score = 0.5000
Threshold = 0.0000: Precision = 0.4000, Recall = 1.0000, F1 Score = 0.5714
Threshold = 0.1000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = 0.2000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222


In [153]:
import pandas as pd
from collections import defaultdict
from datetime import datetime, timedelta
import os

# Define the dataset paths
device_file_path = "/device.csv"
logon_file_path = "/logon.csv"
files_file_path = "/file.csv"

user_id = "SPB1853"  # The user you're interested in

# 1. Load Data
device_data = pd.read_csv(device_file_path)
logon_data = pd.read_csv(logon_file_path)
files_data = pd.read_csv(files_file_path)

# 2. Filter Data for the User
# Filter on `user_id` defined above. Comparing against a separate global named
# `user` would silently select whichever user another cell last assigned.
device_data_user = device_data[device_data['user'] == user_id]
logon_data_user = logon_data[logon_data['user'] == user_id]
files_data_user = files_data[files_data['user'] == user_id]

# 3. Process USB Insertions (from device.csv)
def get_num_usb_insertions_per_week(user_data):
    """
    Counts rows of `user_data` per `Year-Week` label.

    The timestamp is read from the 'date' column, the same column the other
    per-week helpers in this cell read, and must be formatted as
    "%m/%d/%Y %H:%M:%S".

    :param user_data: DataFrame of device events with a 'date' column.
    :return: defaultdict mapping "%Y-%W" week labels to row counts.
    """
    usb_counts = defaultdict(int)
    for row in user_data.itertuples(index=False):
        # Read `row.date`; there is no `timestamp` attribute on these rows
        timestamp = datetime.strptime(row.date, "%m/%d/%Y %H:%M:%S")
        week = timestamp.strftime("%Y-%W")
        usb_counts[week] += 1
    return usb_counts

# Tally weekly device events for the rows selected into device_data_user.
# This runs before the second definition of the same function further down in this cell.
usb_insertions = get_num_usb_insertions_per_week(device_data_user)
def get_after_hours_logons_per_week(user_data, business_start="09:00", business_end="17:00"):
    """
    Counts, per `Year-Week` label, the rows whose time of day is outside business hours.

    A row is inside business hours when business_start <= time < business_end.
    Every row of `user_data` is considered; no activity filter is applied here.

    :param user_data: DataFrame with a 'date' column formatted "%m/%d/%Y %H:%M:%S".
    :param business_start: Start of business hours as "HH:MM".
    :param business_end: End of business hours as "HH:MM" (exclusive).
    :return: defaultdict mapping "%Y-%W" week labels to after-hours row counts.
    """
    opens_at = datetime.strptime(business_start, "%H:%M").time()
    closes_at = datetime.strptime(business_end, "%H:%M").time()

    logon_counts = defaultdict(int)
    for record in user_data.itertuples(index=False):
        moment = datetime.strptime(record.date, "%m/%d/%Y %H:%M:%S")
        during_business = opens_at <= moment.time() < closes_at
        if not during_business:
            logon_counts[moment.strftime("%Y-%W")] += 1
    return logon_counts

def get_num_usb_insertions_per_week(user_data):
    """
    Counts rows of `user_data` per `Year-Week` label.

    :param user_data: DataFrame with a 'date' column formatted "%m/%d/%Y %H:%M:%S".
    :return: defaultdict mapping "%Y-%W" week labels to row counts.
    """
    tally = defaultdict(int)
    for record in user_data.itertuples(index=False):
        # The week label comes straight from the row's 'date' value
        label = datetime.strptime(record.date, "%m/%d/%Y %H:%M:%S").strftime("%Y-%W")
        tally[label] += 1
    return tally

def get_num_exe_files_per_week(user_data):
    """
    Counts, per `Year-Week` label, the rows whose 'file_name' ends with ".exe".

    :param user_data: DataFrame with 'date' ("%m/%d/%Y %H:%M:%S") and 'file_name' columns.
    :return: defaultdict mapping "%Y-%W" week labels to .exe row counts.
    """
    exe_counts = defaultdict(int)
    for record in user_data.itertuples(index=False):
        # The timestamp is parsed for every row, before the extension is checked
        label = datetime.strptime(record.date, "%m/%d/%Y %H:%M:%S").strftime("%Y-%W")
        if not record.file_name.endswith(".exe"):
            continue
        exe_counts[label] += 1
    return exe_counts

In [154]:
# Re-create the DataFrame with the provided data after system reset

import pandas as pd

# Hand-built weekly feature table for user SPB1853: 16 rows, weeks 2010-01 .. 2010-16
data = {
    "user": ["SPB1853" for _ in range(16)],
    "week": [f"2010-{week_no:02d}" for week_no in range(1, 17)],
    "after_hours_logons": [0, 4, 4, 4, 1, 0, 5, 2, 2, 5, 2, 3, 4, 1, 4, 4],
    "num_exe_files": [0 for _ in range(16)],
    "num_usb_insertions": [0.0 for _ in range(16)],
    "num_other_pc": [2, 2, 2, 2, 0, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1],
    "insider": [0 for _ in range(16)],
}

# Materialise the table and write it out without the index column
csv_file_path = '/SPB1853.csv'
df_new = pd.DataFrame(data)
df_new.to_csv(csv_file_path, index=False)

# Bare expression so the notebook echoes the written path
csv_file_path

'/SPB1853.csv'

In [157]:
# %%
import csv
import os
from collections import defaultdict
from datetime import datetime, time, timedelta

import pandas as pd


import os
import csv
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def get_user_usb_data(user_id, dataset_path):
    """
    Reads `device.csv` from `dataset_path` and returns the user's "Connect" rows.

    The first line of the file is treated as a header and skipped. A row is
    kept when column 2 equals `user_id` and column 5 equals "Connect"; rows
    with fewer than six fields are ignored.

    :param user_id: The user ID to filter on.
    :param dataset_path: Folder that contains device.csv.
    :return: List of matching rows (each a list of strings).
    """
    # The file name must be relative: os.path.join discards every earlier
    # component when a later one starts with "/", which ignored dataset_path.
    device_csv = os.path.join(dataset_path, "device.csv")
    with open(device_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; tolerate an empty file
        return [
            row
            for row in reader
            if len(row) > 5 and row[2] == user_id and row[5] == "Connect"
        ]

def get_num_usb_insertions_per_week(user, usb_data):
    """
    Counts USB rows per `Year-Week` label, including zero-count weeks in between.

    Week labels use "%Y-%W", where the days of a year before its first Monday
    are week "00". The filled range is built by labelling every calendar day
    between the earliest and latest event, so week "00" is kept and no label
    from outside the observed date range is introduced.

    :param user: The user ID written into the 'user' column.
    :param usb_data: Rows whose column 1 is a "%m/%d/%Y %H:%M:%S" timestamp.
    :return: DataFrame with ['user', 'week', 'num_usb_insertions'], sorted by week.
    """
    weekly_usb_counts = defaultdict(int)
    first_seen = last_seen = None
    for row in usb_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_usb_counts[file_time.strftime("%Y-%W")] += 1
        if first_seen is None or file_time < first_seen:
            first_seen = file_time
        if last_seen is None or file_time > last_seen:
            last_seen = file_time

    # Ensure all weeks are included, even with 0 count
    weekly_counts = {}
    if first_seen is not None:
        span_days = (last_seen.date() - first_seen.date()).days
        for offset in range(span_days + 1):
            week = (first_seen + timedelta(days=offset)).strftime("%Y-%W")
            weekly_counts[week] = weekly_usb_counts.get(week, 0)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_usb_insertions"])


def get_user_exe_data(user_id, dataset_path):
    """
    Reads `file.csv` from `dataset_path` and returns the user's ".exe" rows.

    The first line of the file is treated as a header and skipped. A row is
    kept when column 2 equals `user_id` and column 4 ends with ".exe"; rows
    with fewer than five fields are ignored.

    :param user_id: The user ID to filter on.
    :param dataset_path: Folder that contains file.csv.
    :return: List of matching rows (each a list of strings).
    """
    # The file name must be relative: os.path.join discards every earlier
    # component when a later one starts with "/", which ignored dataset_path.
    file_csv = os.path.join(dataset_path, "file.csv")
    with open(file_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; tolerate an empty file
        return [
            row
            for row in reader
            if len(row) > 4 and row[2] == user_id and row[4].endswith(".exe")
        ]

def get_num_exe_per_week(user, exe_data):
    """
    Counts .exe rows per `Year-Week` label, including zero-count weeks in between.

    Week labels use "%Y-%W", where the days of a year before its first Monday
    are week "00". The filled range is built by labelling every calendar day
    between the earliest and latest event, so week "00" is kept and no label
    from outside the observed date range is introduced.

    :param user: The user ID written into the 'user' column.
    :param exe_data: Rows whose column 1 is a "%m/%d/%Y %H:%M:%S" timestamp.
    :return: DataFrame with ['user', 'week', 'num_exe_files'], sorted by week.
    """
    weekly_exe_counts = defaultdict(int)
    first_seen = last_seen = None

    for row in exe_data:
        file_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        weekly_exe_counts[file_time.strftime("%Y-%W")] += 1
        if first_seen is None or file_time < first_seen:
            first_seen = file_time
        if last_seen is None or file_time > last_seen:
            last_seen = file_time

    # Ensure all weeks are included, even with 0 count
    weekly_counts = {}
    if first_seen is not None:
        span_days = (last_seen.date() - first_seen.date()).days
        for offset in range(span_days + 1):
            week = (first_seen + timedelta(days=offset)).strftime("%Y-%W")
            weekly_counts[week] = weekly_exe_counts.get(week, 0)

    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_exe_files"])

# %%
def get_user_logon_data(user_id, dataset_path):
    """
    Reads `logon.csv` from `dataset_path` and returns every row for the user.

    A row is kept when column 2 equals `user_id`. The header line is not
    skipped explicitly; it is dropped by the same comparison unless
    `user_id` equals the header text. Blank or short rows are ignored.

    :param user_id: The user ID to filter on.
    :param dataset_path: Folder that contains logon.csv.
    :return: List of matching rows (each a list of strings).
    """
    # The file name must be relative: os.path.join discards every earlier
    # component when a later one starts with "/", which ignored dataset_path.
    logon_csv = os.path.join(dataset_path, "logon.csv")
    with open(logon_csv, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        # csv.reader yields [] for blank lines, so guard the index access
        return [row for row in reader if len(row) > 2 and row[2] == user_id]


# %%
def get_user_pc(logon_data):
    """
    Returns the PC (column 3) that appears in the most rows of `logon_data`.

    On a tie the PC encountered first wins. Empty input raises ValueError.

    :param logon_data: Rows whose column 3 is a PC identifier.
    :return: The most frequent PC identifier.
    """
    rows_per_pc = defaultdict(int)
    for entry in logon_data:
        rows_per_pc[entry[3]] += 1
    # max() keeps the earliest-inserted key among equal counts
    return max(rows_per_pc, key=lambda pc_name: rows_per_pc[pc_name])


# %%
def get_num_other_PC_per_week(user, user_pc, logon_data):
    """
    Counts, per `Year-Week` label, the distinct PCs used other than `user_pc`.

    Week labels use "%Y-%W", where the days of a year before its first Monday
    are week "00". Every week label that falls between the earliest and
    latest row is reported (0 when no other PC was used); the range is built
    by labelling each calendar day in that span, so week "00" is kept and no
    label from outside the observed range is introduced. Empty input yields
    an empty frame instead of raising.

    :param user: The user ID written into the 'user' column.
    :param user_pc: The user's primary PC; rows on this PC are not counted.
    :param logon_data: Rows with a "%m/%d/%Y %H:%M:%S" timestamp in column 1 and the PC in column 3.
    :return: DataFrame with ['user', 'week', 'num_other_pc'], sorted by week.
    """
    weekly_pc_counts = defaultdict(set)  # Unique non-primary PCs per week
    first_seen = last_seen = None

    for row in logon_data:
        logon_time = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        if first_seen is None or logon_time < first_seen:
            first_seen = logon_time
        if last_seen is None or logon_time > last_seen:
            last_seen = logon_time

        if row[3] != user_pc:  # PC differs from the user's primary PC
            weekly_pc_counts[logon_time.strftime("%Y-%W")].add(row[3])

    # Ensure every week in the observed span has a count (0 if no other PCs)
    weekly_counts = {}
    if first_seen is not None:
        span_days = (last_seen.date() - first_seen.date()).days
        for offset in range(span_days + 1):
            week = (first_seen + timedelta(days=offset)).strftime("%Y-%W")
            # .get avoids creating empty entries in the defaultdict
            weekly_counts[week] = len(weekly_pc_counts.get(week, ()))

    # Convert to DataFrame
    output_list = [[user, week, count] for week, count in sorted(weekly_counts.items())]
    return pd.DataFrame(output_list, columns=["user", "week", "num_other_pc"])


def get_after_hours_logons(
    logon_data, user, business_start=time(9, 0, 0), business_end=time(17, 0, 0)
):
    """
    Aggregates after-hours logons per week for a specified user.

    Only rows whose activity is "logon" (case-insensitive) and whose user
    equals `user` are considered; rows with an unparsable timestamp are
    skipped. A logon is after-hours when its time of day is before
    `business_start` or at/after `business_end`.

    Week labels use "%Y-%W", where the days of a year before its first Monday
    are week "00". Every week label between the earliest and latest
    considered logon is reported (0 when none was after-hours); the range is
    built by labelling each calendar day in that span, so week "00" is kept
    and no label from outside the observed range is introduced.

    :param logon_data: List of logon events in the format [id, date, user, pc, activity]
    :param user: The specific user to filter logon events for.
    :param business_start: Datetime.time representing start of business hours.
    :param business_end: Datetime.time representing end of business hours.
    :return: DataFrame with ['user', 'week', 'after_hours_logons']
    """
    after_hours_counts = defaultdict(int)
    first_seen = last_seen = None

    for row in logon_data:
        _, timestamp, logon_user, _, activity = row  # Exactly five columns expected

        # Only process logons for the specified user
        if activity.lower() != "logon" or logon_user != user:
            continue

        try:
            logon_time = datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S")
        except ValueError:
            continue  # Skip invalid timestamps

        if first_seen is None or logon_time < first_seen:
            first_seen = logon_time
        if last_seen is None or logon_time > last_seen:
            last_seen = logon_time

        # Compare only the time-of-day component against business hours
        logon_hour = logon_time.time()
        if logon_hour < business_start or logon_hour >= business_end:
            after_hours_counts[logon_time.strftime("%Y-%W")] += 1

    # Fill every week label in the observed span, using 0 where nothing was counted
    weekly_counts = {}
    if first_seen is not None:
        span_days = (last_seen.date() - first_seen.date()).days
        for offset in range(span_days + 1):
            week = (first_seen + timedelta(days=offset)).strftime("%Y-%W")
            weekly_counts[week] = after_hours_counts.get(week, 0)

    # Convert to DataFrame
    result_data = [(user, week, weekly_counts[week]) for week in sorted(weekly_counts)]
    after_hours_df = pd.DataFrame(
        result_data, columns=["user", "week", "after_hours_logons"]
    )

    return after_hours_df


# %%
def find_insider_answers_file(user, insider_root):
    """
    Recursively searches `insider_root` for the insider CSV file of the given user.

    A file matches when its base name starts with "r5.2-" and ends with
    "-{user}.csv". The first match encountered by `os.walk` is returned.

    :param user: The user ID (e.g., "CWW1120")
    :param insider_root: The root folder containing multiple r5.2-* subfolders.
    :return: The full path to the user's insider CSV file if found, else None.
    """
    wanted_suffix = f"-{user}.csv"
    for root, _, files in os.walk(insider_root):
        for file in files:
            # os.walk yields bare file names without any directory part, so the
            # prefix test must not contain a path such as "/content/".
            if file.startswith("r5.2-") and file.endswith(wanted_suffix):
                return os.path.join(root, file)  # Full path of the first match
    return None  # No matching file anywhere under insider_root


def extract_weeks_from_csv(file_path):
    """
    Collects the distinct `Year-Week` labels found in the 3rd column of a CSV file.

    Rows with fewer than three fields, or whose 3rd field does not parse as
    "%m/%d/%Y %H:%M:%S", are ignored. Any error while opening or reading the
    file is printed and the weeks gathered so far are returned.

    :param file_path: Path to the insider CSV file.
    :return: A set of detected `Year-Week` values.
    """
    stamp_format = "%m/%d/%Y %H:%M:%S"
    found_weeks = set()

    try:
        with open(file_path, mode="r", newline="", encoding="utf-8") as handle:
            for fields in csv.reader(handle):
                if len(fields) >= 3:
                    try:
                        label = datetime.strptime(fields[2], stamp_format).strftime(
                            "%Y-%W"
                        )
                    except ValueError:
                        continue  # Not a timestamp row
                    found_weeks.add(label)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return found_weeks


def label_insider_weeks(df, user, insider_root):
    """
    Adds an 'insider' column (1/0) to `df`, flagging weeks listed in the user's insider file.

    The frame is modified in place and also returned. When no insider file is
    found for the user, every row gets 0.

    :param df: DataFrame with a 'week' column holding `Year-Week` labels.
    :param user: The user ID whose insider file is looked up.
    :param insider_root: Path to the folder containing multiple r5.2-* subfolders.
    :return: The same DataFrame with an 'insider' column.
    """
    answers_path = find_insider_answers_file(user, insider_root)

    if answers_path:
        # Weeks that appear in the insider file are the positive class
        flagged_weeks = extract_weeks_from_csv(answers_path)
        df["insider"] = df["week"].apply(lambda label: int(label in flagged_weeks))
    else:
        df["insider"] = 0

    return df


def combine_user_feature_data(user, dataset_path, insider_root):
    """
    Builds one weekly feature table for `user` and labels insider weeks.

    The four per-week feature frames (after-hours logons, .exe files, USB
    insertions, other PCs) are outer-joined on "week"; gaps become 0.

    :param user: The user ID to build features for.
    :param dataset_path: Folder passed to the raw-log readers.
    :param insider_root: Folder searched for the user's insider file.
    :return: DataFrame with 'user', 'week', the feature columns and 'insider'.
    """
    # Logon-derived features
    logon_rows = get_user_logon_data(user, dataset_path)
    primary_pc = get_user_pc(logon_rows)
    other_pc_weekly = get_num_other_PC_per_week(user, primary_pc, logon_rows)
    after_hours_weekly = get_after_hours_logons(logon_rows, user)

    # File- and device-derived features
    exe_weekly = get_num_exe_per_week(user, get_user_exe_data(user, dataset_path))
    usb_weekly = get_num_usb_insertions_per_week(
        user, get_user_usb_data(user, dataset_path)
    )

    # Keep only the join key and the feature column of each frame
    feature_frames = [
        after_hours_weekly[["week", "after_hours_logons"]],
        exe_weekly[["week", "num_exe_files"]],
        usb_weekly[["week", "num_usb_insertions"]],
        other_pc_weekly[["week", "num_other_pc"]],
    ]

    merged_df = feature_frames[0]
    for frame in feature_frames[1:]:
        merged_df = merged_df.merge(frame, on="week", how="outer")

    # Weeks missing from one source count as zero activity
    merged_df = merged_df.fillna(0)

    merged_df.insert(0, "user", user)
    return label_insider_weeks(merged_df, user, insider_root)

# Example usage: load a pre-built weekly feature table for one user and print it.
# NOTE: dataset_path is assigned here but not used anywhere else in this cell.
dataset_path = os.path.join("Insider threat dataset", "r5.2")
user = "SPB1853"
file_path = "/SPB1853.csv"  # Change to your actual file path
df = pd.read_csv(file_path, on_bad_lines='skip')  # This will skip lines with errors

# The call that would build the table from the raw logs is disabled (and was left truncated):
# final_df = combine_user_feature_data(user, data
print(df)

       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0   SPB1853  2010-01                   0              0                 0.0   
1   SPB1853  2010-02                   4              0                 0.0   
2   SPB1853  2010-03                   4              0                 0.0   
3   SPB1853  2010-04                   4              0                 0.0   
4   SPB1853  2010-05                   1              0                 0.0   
5   SPB1853  2010-06                   0              0                 0.0   
6   SPB1853  2010-07                   5              0                 0.0   
7   SPB1853  2010-08                   2              0                 0.0   
8   SPB1853  2010-09                   2              0                 0.0   
9   SPB1853  2010-10                   5              0                 0.0   
10  SPB1853  2010-11                   2              0                 0.0   
11  SPB1853  2010-12                   3            

In [161]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset again if necessary
file_path = "/SPB1853.csv"  # Adjust to your file path if needed
df = pd.read_csv(file_path, on_bad_lines='skip')

# Check if the dataset is loaded correctly
print("Dataset Loaded:\n", df.head())

# Select Features
feature_cols = ["after_hours_logons", "num_exe_files", "num_usb_insertions", "num_other_pc"]
X = df[feature_cols]  # Using df instead of final_df
y_true = df["insider"]  # Ground truth for evaluation

# Introduce anomalies for testing (optional, if you don't already have anomalies)
# For testing purposes, you can modify a few values in the 'insider' column
# NOTE(review): these two labels are set by hand, so the metrics below are
# measured against synthetic ground truth, not against real insider weeks.
df.loc[0, 'insider'] = 1  # Set the first entry as an anomaly
df.loc[5, 'insider'] = 1  # Set the sixth entry as an anomaly
y_true = df["insider"]  # Re-read the column so y_true reflects the two edits above

# Normalize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train One-Class SVM with a Higher `nu`
# The model is fitted on every row of X_scaled; the labels are not passed to fit().
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.4)  # Adjust nu and gamma
oc_svm.fit(X_scaled)

# Predict Anomalies
# Scores are computed on the same rows the model was fitted on (no held-out data).
decision_scores = oc_svm.decision_function(X_scaled)

# Check decision scores
print("Decision Scores:\n", decision_scores[:10])

# Set a Fixed Threshold (Adjust the threshold to see different results)
threshold = 0 # You can change this value to something lower to capture anomalies
y_pred = [1 if score < threshold else 0 for score in decision_scores]  # 1 = score below threshold (flagged)

# Check Predictions
print("Predictions:\n", y_pred[:10])

# Calculate Metrics
# pos_label=1 scores the flagged class; zero_division=1 is the value reported
# when a metric's denominator is zero.
precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

# Print Results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Dataset Loaded:
       user     week  after_hours_logons  num_exe_files  num_usb_insertions  \
0  SPB1853  2010-01                   0              0                 0.0   
1  SPB1853  2010-02                   4              0                 0.0   
2  SPB1853  2010-03                   4              0                 0.0   
3  SPB1853  2010-04                   4              0                 0.0   
4  SPB1853  2010-05                   1              0                 0.0   

   num_other_pc  insider  
0             2        0  
1             2        0  
2             2        0  
3             2        0  
4             0        0  
Decision Scores:
 [-1.03034908e-01  9.44578350e-02  9.44578350e-02  9.44578350e-02
 -4.47549744e-01 -6.66545136e-02  5.14413227e-05 -3.39440310e-04
 -3.39440310e-04  5.14413227e-05]
Predictions:
 [1, 0, 0, 0, 1, 1, 0, 1, 1, 0]
Precision: 0.4000
Recall: 1.0000
F1-score: 0.5714


In [162]:
# Define fixed threshold values for anomaly detection
thresholds = [-0.2, -0.1, 0, 0.1, 0.2]  # Try different fixed thresholds

# Loop through each threshold and compute precision, recall, F1-score
# Relies on `decision_scores` and `y_true` already being defined by an earlier cell.
for threshold in thresholds:
    y_pred = np.where(decision_scores < threshold, 1, 0)  # Label as "1" if below threshold (anomaly)

    # zero_division=1 is the value reported when a metric's denominator is zero
    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=1)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=1)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=1)

    print(f"Threshold = {threshold:.4f}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

Threshold = -0.2000: Precision = 0.0000, Recall = 0.0000, F1 Score = 0.0000
Threshold = -0.1000: Precision = 0.5000, Recall = 0.5000, F1 Score = 0.5000
Threshold = 0.0000: Precision = 0.4000, Recall = 1.0000, F1 Score = 0.5714
Threshold = 0.1000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
Threshold = 0.2000: Precision = 0.1250, Recall = 1.0000, F1 Score = 0.2222
