In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from scipy.stats import norm, expon, uniform, pareto
from sklearn.metrics import mean_squared_error
import warnings

# Load the dataset
df = pd.read_csv("notebook/Train_data.csv")

# Exclude the class column
input_features = df.drop(columns=['class'])

# Split the dataset into training (70%) and testing (30%) sets
train_data, test_data = train_test_split(input_features, test_size=0.3, random_state=42)

# Save the splits if needed
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print("Data preparation is complete!")

Data preparation is complete!


In [33]:
# Select only numeric columns for Z-score computation
numeric_columns = train_data.select_dtypes(include=['number'])
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()

# Function to detect anomalies based on a threshold
def detect_anomalies(z_scores, threshold=3):
    """
    Detect anomalies using Z-scores.
    Returns a DataFrame with a column 'is_anomaly' indicating anomalies.
    """
    anomalies = (np.abs(z_scores) > threshold).any(axis=1)
    return pd.DataFrame({'is_anomaly': anomalies})

# Example: Use a threshold of 3
threshold = 3
anomaly_results = detect_anomalies(z_scores, threshold)

# Add 'is_anomaly' to the original training dataset
train_data['is_anomaly'] = anomaly_results['is_anomaly']

# Save the results if needed
train_data.to_csv("train_data_with_anomalies.csv", index=False)

# Experiment with different thresholds
thresholds = [2, 2.5, 3, 3.5]
for t in thresholds:
    anomalies = detect_anomalies(z_scores, t)
    print(f"Threshold: {t}, Anomalies Detected: {anomalies['is_anomaly'].sum()}")

print("Z-score computation and anomaly detection are complete!")

Threshold: 2, Anomalies Detected: 5955
Threshold: 2.5, Anomalies Detected: 5413
Threshold: 3, Anomalies Detected: 3235
Threshold: 3.5, Anomalies Detected: 2188
Z-score computation and anomaly detection are complete!


In [39]:
original_data = pd.read_csv("notebook/Train_data.csv")
train_data = pd.read_csv("notebook/Train_data.csv")

# Bring the 'class' column back from original data
train_data['class'] = original_data.loc[train_data.index, 'class']

# Convert 'class' column to binary values
train_data['class'] = train_data['class'].map({'anomaly': 1, 'normal': 0})

# Exclude columns with zero variance
numerical_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
numerical_columns = [col for col in numerical_columns if train_data[col].std() > 0]

# Replace NaN/Inf values
train_data = train_data.replace([np.inf, -np.inf], np.nan).dropna()

# Function to calculate MSE
def calculate_mse(data, dist, params):
    """Calculate the Mean Square Error for a given distribution."""
    pdf = dist.pdf(data, *params)
    mse = ((data - pdf) ** 2).mean()
    return mse

# Function to fit a distribution and compute MSE
def fit_distribution(data, dist_name):
    """Fit a distribution to the data and calculate MSE."""
    dist = getattr(stats, dist_name)
    params = dist.fit(data)
    mse = calculate_mse(data, dist, params)
    return mse, params

# Numerical columns
numerical_columns = train_data.select_dtypes(include=['float64', 'int64']).columns

# Results dictionary
pdf_results = {}

# Handle NaN values and skip columns with NaN data
for column in numerical_columns:
    print(f"Processing column: {column}")
    data = train_data[column]
    
    # Skip column if it contains NaN values
    if data.isna().sum() > 0:
        print(f"Skipping column {column} due to NaN values.")
        continue
    
    # Skip column if it has constant or zero variance
    if data.nunique() <= 1 or data.std() == 0:
        print(f"Skipping column {column} due to insufficient variance or constant values.")
        continue
    
    column_results = {}
    distributions = ['norm', 'expon', 'uniform', 'pareto']
    
    for dist_name in distributions:
        mse_all, params_all = fit_distribution(data, dist_name)
        column_results[dist_name] = {'mse': mse_all, 'params': params_all}
    
    # Choose the best distribution
    best_fit = min(column_results.items(), key=lambda x: x[1]['mse'])
    print(f"Best fit for {column}: {best_fit[0]} with MSE {best_fit[1]['mse']}")

Processing column: duration
Best fit for duration: uniform with MSE 7310352.69666604
Processing column: src_bytes
Best fit for src_bytes: expon with MSE 5812343958441.492
Processing column: dst_bytes
Best fit for dst_bytes: expon with MSE 7902776285.225545
Processing column: land
Best fit for land: uniform with MSE 0.9999206097173706
Processing column: wrong_fragment
Best fit for wrong_fragment: uniform with MSE 0.16356162450160544
Processing column: urgent
Best fit for urgent: uniform with MSE 0.9999603048586853
Processing column: hot
Best fit for hot: uniform with MSE 4.67464530526655
Processing column: num_failed_logins
Best fit for num_failed_logins: uniform with MSE 0.06396872022864401
Processing column: logged_in
Best fit for logged_in: norm with MSE 0.36213640243069417
Processing column: num_compromised
Best fit for num_compromised: uniform with MSE 108.56831715671322
Processing column: root_shell
Best fit for root_shell: uniform with MSE 0.9984518894887265
Processing column: su

In [40]:
# Convert 'class' column to binary (0 for normal, 1 for anomaly)
train_data['class'] = train_data['class'].map({'normal': 0, 'anomaly': 1})

# Identify categorical columns
categorical_columns = train_data.select_dtypes(include=['object']).columns

# Function to calculate PMF
def calculate_pmf(data):
    """Calculate the Probability Mass Function (PMF) for categorical data."""
    value_counts = data.value_counts()
    total_count = len(data)
    pmf = value_counts / total_count
    return pmf

# Dictionary to store the PMFs for all categorical columns
pmf_results = {}

# Calculate PMF for each categorical column
for column in categorical_columns:
    print(f"Calculating PMF for column: {column}")
    
    # Calculate the overall PMF for the column
    pmf_results[column] = {}
    pmf_results[column]['overall_pmf'] = calculate_pmf(train_data[column])
    
    # Calculate the conditional PMF for anomaly (class == 1)
    anomaly_data = train_data[train_data['class'] == 1]
    pmf_results[column]['anomaly_pmf'] = calculate_pmf(anomaly_data[column])
    
    # Calculate the conditional PMF for normal (class == 0)
    normal_data = train_data[train_data['class'] == 0]
    pmf_results[column]['normal_pmf'] = calculate_pmf(normal_data[column])

# Print PMF results for verification
for column, pmf_data in pmf_results.items():
    print(f"\nPMF for column: {column}")
    print("Overall PMF:")
    print(pmf_data['overall_pmf'])
    print("Conditional PMF for Anomalies (class=1):")
    print(pmf_data['anomaly_pmf'])
    print("Conditional PMF for Normal (class=0):")
    print(pmf_data['normal_pmf'])

Calculating PMF for column: protocol_type
Calculating PMF for column: service
Calculating PMF for column: flag

PMF for column: protocol_type
Overall PMF:
protocol_type
tcp     0.814782
udp     0.119522
icmp    0.065695
Name: count, dtype: float64
Conditional PMF for Anomalies (class=1):
Series([], Name: count, dtype: float64)
Conditional PMF for Normal (class=0):
Series([], Name: count, dtype: float64)

PMF for column: service
Overall PMF:
service
http         0.317680
private      0.172714
domain_u     0.072245
smtp         0.057518
ftp_data     0.055414
               ...   
urh_i        0.000159
pm_dump      0.000119
red_i        0.000119
tim_i        0.000079
http_8001    0.000040
Name: count, Length: 66, dtype: float64
Conditional PMF for Anomalies (class=1):
Series([], Name: count, dtype: float64)
Conditional PMF for Normal (class=0):
Series([], Name: count, dtype: float64)

PMF for column: flag
Overall PMF:
flag
SF        0.594355
S0        0.278223
REJ       0.087964
RSTR     