<a href="https://colab.research.google.com/github/Kishanmvs/MachineLearningLabWork/blob/main/Lab2-Bayesian.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the latest version of the spam-email dataset via kagglehub.
# `path` is the local directory holding the dataset files; a later cell builds
# the spambase.data / spambase.names file paths from it.
path = kagglehub.dataset_download("kaggleprollc/spam-email-data-uci")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'spam-email-data-uci' dataset.
Path to dataset files: /kaggle/input/spam-email-data-uci


In [None]:
import pandas as pd

# Locations of the documentation file and the raw data file inside the
# downloaded dataset directory.
names_path = f"{path}/spambase.names"
data_path = f"{path}/spambase.data"

# Column names taken from the UCI documentation and written out by hand,
# grouped by feature family: 48 word frequencies, 6 character frequencies,
# 3 capital-run-length statistics, and the label column last.
column_names = (
    # Word-frequency features
    [
        "word_freq_make", "word_freq_address", "word_freq_all",
        "word_freq_3d", "word_freq_our", "word_freq_over",
        "word_freq_remove", "word_freq_internet", "word_freq_order",
        "word_freq_mail", "word_freq_receive", "word_freq_will",
        "word_freq_people", "word_freq_report", "word_freq_addresses",
        "word_freq_free", "word_freq_business", "word_freq_email",
        "word_freq_you", "word_freq_credit", "word_freq_your",
        "word_freq_font", "word_freq_000", "word_freq_money",
        "word_freq_hp", "word_freq_hpl", "word_freq_george",
        "word_freq_650", "word_freq_lab", "word_freq_labs",
        "word_freq_telnet", "word_freq_857", "word_freq_data",
        "word_freq_415", "word_freq_85", "word_freq_technology",
        "word_freq_1999", "word_freq_parts", "word_freq_pm",
        "word_freq_direct", "word_freq_cs", "word_freq_meeting",
        "word_freq_original", "word_freq_project", "word_freq_re",
        "word_freq_edu", "word_freq_table", "word_freq_conference",
    ]
    # Character-frequency features
    + [
        "char_freq_;", "char_freq_(", "char_freq_[",
        "char_freq_!", "char_freq_$", "char_freq_#",
    ]
    # Capital-letter run-length statistics
    + [
        "capital_run_length_average",
        "capital_run_length_longest",
        "capital_run_length_total",
    ]
    # Class label
    + ["spam"]
)

# The data file is read without a header row; the names above label the
# columns. Preview the first rows as a sanity check.
df = pd.read_csv(data_path, names=column_names, header=None)
print(df.head())

   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_;  char_freq_(  \
0             0.00            0.00  ...         0.00        0.000   
1 

In [None]:
from sklearn.model_selection import train_test_split

# The last column is the label (1 = spam, 0 = not spam); every column before
# it is a feature. Both are pulled out as NumPy arrays.
y = df.iloc[:, -1].to_numpy()
X = df.iloc[:, :-1].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [None]:
import numpy as np
from scipy.stats import multivariate_normal

# Estimate the parameters of one Gaussian per class from the training split.
classes = np.unique(y_train)
priors, means, covariances = {}, {}, {}

for c in classes:
    in_class = y_train == c
    X_c = X_train[in_class]
    # Prior: fraction of training rows that carry this label.
    priors[c] = np.mean(in_class)
    # Likelihood parameters: per-feature mean and the full covariance matrix
    # (rowvar=False because each row is one observation).
    means[c] = X_c.mean(axis=0)
    covariances[c] = np.cov(X_c, rowvar=False)

In [None]:
def predict_bayes(X, class_labels=None, class_means=None, class_covs=None, class_priors=None):
    """Classify samples with a Gaussian Bayes classifier (one full-covariance Gaussian per class).

    For every class the log-posterior (up to a shared constant)
    ``log p(x | c) + log P(c)`` is evaluated, and each sample is assigned the
    class with the largest value.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to classify.
    class_labels : array-like, optional
        Class labels to choose from. Defaults to the notebook-level ``classes``.
    class_means : mapping label -> mean vector, optional
        Defaults to the notebook-level ``means``.
    class_covs : mapping label -> covariance matrix, optional
        Defaults to the notebook-level ``covariances``.
    class_priors : mapping label -> prior probability, optional
        Defaults to the notebook-level ``priors``.

    Returns
    -------
    numpy.ndarray
        The predicted class label for each sample, in input order. Empty
        input yields an empty array.
    """
    # Fall back to the parameters estimated earlier in the notebook, so that
    # the existing call `predict_bayes(X_test)` keeps working unchanged.
    if class_labels is None:
        class_labels = classes
    if class_means is None:
        class_means = means
    if class_covs is None:
        class_covs = covariances
    if class_priors is None:
        class_priors = priors

    class_labels = np.asarray(class_labels)
    X = np.asarray(X, dtype=float)
    if X.size == 0:
        return class_labels[:0]

    # Work in log space: with dozens of features the raw density underflows
    # to 0.0 for outlying samples, which would make every class tie and the
    # argmax silently fall back to the first class.
    # One vectorised logpdf call per class also avoids re-decomposing the
    # covariance matrix for every single sample.
    log_posteriors = np.column_stack([
        # atleast_1d: SciPy squeezes the result to a scalar for a single sample.
        np.atleast_1d(
            multivariate_normal.logpdf(
                X, mean=class_means[c], cov=class_covs[c], allow_singular=True
            )
        ) + np.log(class_priors[c])
        for c in class_labels
    ])

    # argmax yields a column index; map it back to the actual class label so
    # the result is correct even when labels are not 0..K-1.
    return class_labels[np.argmax(log_posteriors, axis=1)]

In [None]:
# Run Bayesian prediction on the held-out test features.
# y_pred receives the array returned by predict_bayes: one prediction per row of X_test.
y_pred = predict_bayes(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the predictions against the held-out labels, then report both metrics.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)

Accuracy: 0.8428674873280232
Confusion Matrix:
 [[625 179]
 [ 38 539]]


In [None]:
# The label column is 1 for spam and 0 otherwise, so summing it over the
# whole dataset counts the spam emails.
spam_labels = df['spam']
total_spam = spam_labels.sum()
print(f"Total number of spam emails: {total_spam}")

Total number of spam emails: 1813


In [None]:
# Count the rows of the whole dataset whose label is 0 (not spam).
is_non_spam = df['spam'].eq(0)
total_non_spam = is_non_spam.sum()
print(f"Total number of non-spam emails: {total_non_spam}")

Total number of non-spam emails: 2788
