In [7]:
import pandas as pd
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import pandas.api.types as ptypes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import OneHotEncoder

# Load your data
df = pd.read_csv("train_data.csv")

# Step 1: Split data into features and target
X = df.drop(columns='class')  # Features (everything except 'class')
y = df['class']  # Target variable (anomaly or normal)

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit the distributions (PDF for continuous features, PMF for categorical features)
def fit_distributions(X_train, y_train, epsilon=1e-6):
    distributions = {
        'continuous': {},
        'categorical': {}
    }
    
    # Fit continuous features (PDFs)
    for feature in X_train.select_dtypes(include='number').columns:
        # For 'anomaly' class
        data_anomaly = X_train[y_train == 'anomaly'][feature]
        mean_anomaly = data_anomaly.mean()
        std_anomaly = data_anomaly.std()
        
        # For 'normal' class
        data_normal = X_train[y_train == 'normal'][feature]
        mean_normal = data_normal.mean()
        std_normal = data_normal.std()
        
        # Avoid division by zero: add epsilon to the standard deviation if it's zero
        std_anomaly = std_anomaly if std_anomaly > 0 else epsilon
        std_normal = std_normal if std_normal > 0 else epsilon
        
        distributions['continuous'][feature] = {
            'anomaly': norm(loc=mean_anomaly, scale=std_anomaly),
            'normal': norm(loc=mean_normal, scale=std_normal)
        }
    
    # Fit categorical features (PMFs)
    for feature in X_train.select_dtypes(include='object').columns:
        pmf_anomaly = X_train[y_train == 'anomaly'][feature].value_counts(normalize=True).to_dict()
        pmf_normal = X_train[y_train == 'normal'][feature].value_counts(normalize=True).to_dict()
        
        distributions['categorical'][feature] = {
            'anomaly': pmf_anomaly,
            'normal': pmf_normal
        }
    
    return distributions

# Step 3: Calculate probability of anomaly and normal for each row in the test set
def calculate_probability(row, distributions, priors):
    prob_anomaly = priors['anomaly']
    prob_normal = priors['normal']
    
    for feature, value in row.items():
        if ptypes.is_numeric_dtype(value):  # Only check for NaN or zero if the value is numeric
            if np.isnan(value) or value == 0:
                continue  # Skip NaN or zero values
        
        if feature in distributions['continuous']:
            # Use PDF for continuous features
            prob_anomaly *= distributions['continuous'][feature]['anomaly'].pdf(value)
            prob_normal *= distributions['continuous'][feature]['normal'].pdf(value)
        elif feature in distributions['categorical']:
            # Use PMF for categorical features
            prob_anomaly *= distributions['categorical'][feature]['anomaly'].get(value, 0)
            prob_normal *= distributions['categorical'][feature]['normal'].get(value, 0)
    
    return prob_anomaly, prob_normal

# Step 4: Predict labels for the test set
def predict(X_test, distributions, priors):
    predictions = []
    
    for _, row in X_test.iterrows():
        prob_anomaly, prob_normal = calculate_probability(row, distributions, priors)
        
        if prob_anomaly > prob_normal:
            predictions.append('anomaly')
        else:
            predictions.append('normal')
    
    return predictions

# Step 5: Calculate priors (probability of anomaly and normal)
priors = {
    'anomaly': y_train.value_counts(normalize=True)['anomaly'],
    'normal': y_train.value_counts(normalize=True)['normal']
}

# Step 6: Fit the distributions on the training data
distributions = fit_distributions(X_train, y_train)

# Step 7: Predict on the test set
predictions = predict(X_test, distributions, priors)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, pos_label='anomaly')
recall = recall_score(y_test, predictions, pos_label='anomaly')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


#There are differences between two models (mine and my colleagues (elfeel)) 
#Mine handles invalid points like zeros or NaNs by removing them and doesnt precompute log values for pdf calculations
#Feels precomputes log-pdfs first for all data points including a small smoothing constant then reverts back to regular prob.
#Mine assigns a very small probability for missing values (epsilon) while his assigns -np.inf which could penalize instances
#Mine directly multiplies raw prior*likelihood, his adds the pre-log-prior to the log-probabilities for numerical stability
#Mine uses an iterative approach instead of a vectorized approach

Accuracy: 0.910895018852947
Precision: 0.9190726159230096
Recall: 0.8883720930232558


In [11]:
# Load the dataset
data = pd.read_csv('train_data.csv')

# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()

# Extract target and features
X = data.drop(columns=['class'])
y = data['class']

# One-hot encode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
categorical_encoded = encoder.fit_transform(data[categorical_columns])
categorical_encoded = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Combine encoded categorical features with numerical features
X_encoded = pd.concat([data[numerical_columns], categorical_encoded], axis=1)

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)

# Initialize models
models = {
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB()
}

# Evaluate models
results = []
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='anomaly')
    recall = recall_score(y_test, y_pred, pos_label='anomaly')

    # Append results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)

print("Model Performance:")
print(results_df)

Model Performance:
           Model  Accuracy  Precision    Recall
0     GaussianNB  0.557026   0.745098  0.075504
1  MultinomialNB  0.583488   0.755102  0.157536
2    BernoulliNB  0.953824   0.992246  0.908033
