In [None]:
import pandas as pd
from scipy.stats import norm

# Load the labelled emails and estimate the class priors P(Spam) and P(Not Spam).
df = pd.read_csv('Bayes.csv')

# Summing a boolean mask counts the rows that carry each label.
spam_count = (df['Spam'] == 'Yes').sum()
not_spam_count = (df['Spam'] == 'No').sum()

# The priors are taken over the rows labelled 'Yes' or 'No' only.
labelled_count = spam_count + not_spam_count
p_of_spam = spam_count / labelled_count
p_of_not_spam = not_spam_count / labelled_count

In [None]:
# Keyword feature "offer": marginal, joint and class-conditional probabilities.
# Every denominator is the actual number of rows in df rather than a
# hard-coded 10000, so the estimates stay correct for any data set size.
p_offer = (df['Contains_Offer'] == "Yes").sum() / len(df)
p_not_offer = (df['Contains_Offer'] == "No").sum() / len(df)
p_spam_and_offer = ((df['Contains_Offer'] == "Yes") & (df['Spam'] == "Yes")).sum() / len(df)
p_not_spam_and_offer = ((df['Contains_Offer'] == "Yes") & (df['Spam'] == "No")).sum() / len(df)

# P(offer | class) = P(offer and class) / P(class)
p_offer_if_spam = p_spam_and_offer / p_of_spam
p_offer_if_not_spam = p_not_spam_and_offer / p_of_not_spam

In [None]:
# Keyword feature "win": marginal, joint and class-conditional probabilities.
# Row counts are divided by len(df) instead of a hard-coded 10000 so the
# estimates stay correct when the data set has a different number of rows.
p_win = df[df['Contains_Win'] == "Yes"].shape[0] / len(df)
p_not_win = df[df['Contains_Win'] == "No"].shape[0] / len(df)
p_spam_and_win = df[(df['Contains_Win'] == "Yes") & (df['Spam'] == "Yes")].shape[0] / len(df)
p_not_spam_and_win = df[(df['Contains_Win'] == "Yes") & (df['Spam'] == "No")].shape[0] / len(df)

# P(win | class) = P(win and class) / P(class)
p_win_given_spam = p_spam_and_win / p_of_spam
p_win_given_not_spam = p_not_spam_and_win / p_of_not_spam

In [None]:
# Keyword feature "financial": marginal, joint and class-conditional probabilities.
# Row counts are divided by len(df) instead of a hard-coded 10000 so the
# estimates stay correct when the data set has a different number of rows.
p_fin = df[df['Contains_Financial'] == "Yes"].shape[0] / len(df)
p_not_fin = df[df['Contains_Financial'] == "No"].shape[0] / len(df)
p_spam_and_fin = df[(df['Contains_Financial'] == "Yes") & (df['Spam'] == "Yes")].shape[0] / len(df)
p_not_spam_and_fin = df[(df['Contains_Financial'] == "Yes") & (df['Spam'] == "No")].shape[0] / len(df)

# P(financial | class) = P(financial and class) / P(class)
p_fin_given_spam = p_spam_and_fin / p_of_spam
p_fin_given_not_spam = p_not_spam_and_fin / p_of_not_spam

In [None]:
def gaussian_likelihood(x, mean, std):
    """Return the normal probability density at ``x``.

    Args:
        x: Point (or array of points) at which to evaluate the density.
        mean: Location (mean) of the normal distribution.
        std: Scale (standard deviation) of the normal distribution.

    Returns:
        The value of the N(mean, std) density at ``x``, as computed by
        ``scipy.stats.norm``.
    """
    distribution = norm(loc=mean, scale=std)
    return distribution.pdf(x)

In [None]:
# Per-class mean and standard deviation of the two continuous features.
# These are the parameters passed to gaussian_likelihood when scoring an email.
spam_rows = df[df['Spam'] == "Yes"]
not_spam_rows = df[df['Spam'] == "No"]

# Email length
email_length_mean_spam = spam_rows['Email_Length'].mean()
email_length_std_spam = spam_rows['Email_Length'].std()
email_length_mean_not_spam = not_spam_rows['Email_Length'].mean()
email_length_std_not_spam = not_spam_rows['Email_Length'].std()

# Capital letter ratio
capital_ratio_mean_spam = spam_rows['Capital_Letter_Ratio'].mean()
capital_ratio_std_spam = spam_rows['Capital_Letter_Ratio'].std()
capital_ratio_mean_not_spam = not_spam_rows['Capital_Letter_Ratio'].mean()
capital_ratio_std_not_spam = not_spam_rows['Capital_Letter_Ratio'].std()

In [None]:
def calculate_posterior(email):
    """Classify a single email as "Spam" or "Not Spam".

    For each class the score is the product of three keyword terms, two
    Gaussian density terms and the class prior. The scores are not
    normalised; only their ordering is used.

    Args:
        email: Mapping with the keys 'Contains_Offer', 'Contains_Win',
            'Contains_Financial' (compared against "Yes"), 'Email_Length'
            and 'Capital_Letter_Ratio'.

    Returns:
        "Spam" when the spam score is strictly greater than the not-spam
        score, otherwise "Not Spam" (so ties resolve to "Not Spam").
    """
    # Pull the features out of the email
    offer_flag = email['Contains_Offer']
    win_flag = email['Contains_Win']
    financial_flag = email['Contains_Financial']
    length = email['Email_Length']
    caps_ratio = email['Capital_Letter_Ratio']

    def keyword_term(p_present, flag):
        # Probability of the observed keyword state given the class.
        if flag == "Yes":
            return p_present
        return 1 - p_present

    # Score for the spam class: likelihood terms first, prior last
    score_spam = keyword_term(p_offer_if_spam, offer_flag)
    score_spam = score_spam * keyword_term(p_win_given_spam, win_flag)
    score_spam = score_spam * keyword_term(p_fin_given_spam, financial_flag)
    score_spam = score_spam * gaussian_likelihood(length, email_length_mean_spam, email_length_std_spam)
    score_spam = score_spam * gaussian_likelihood(caps_ratio, capital_ratio_mean_spam, capital_ratio_std_spam)
    score_spam = score_spam * p_of_spam

    # Score for the not-spam class, built the same way
    score_not_spam = keyword_term(p_offer_if_not_spam, offer_flag)
    score_not_spam = score_not_spam * keyword_term(p_win_given_not_spam, win_flag)
    score_not_spam = score_not_spam * keyword_term(p_fin_given_not_spam, financial_flag)
    score_not_spam = score_not_spam * gaussian_likelihood(length, email_length_mean_not_spam, email_length_std_not_spam)
    score_not_spam = score_not_spam * gaussian_likelihood(caps_ratio, capital_ratio_mean_not_spam, capital_ratio_std_not_spam)
    score_not_spam = score_not_spam * p_of_not_spam

    # The class with the larger score wins
    return "Spam" if score_spam > score_not_spam else "Not Spam"

In [None]:
# Sample emails to classify, written as one row per email under a shared header.
feature_names = ('Contains_Offer', 'Contains_Win', 'Contains_Financial', 'Email_Length', 'Capital_Letter_Ratio')
feature_rows = [
    ("No", "Yes", "No", 450, 0.2),
    ("Yes", "Yes", "No", 150, 0.05),
    ("No", "No", "Yes", 500, 0.25),
    ("Yes", "No", "No", 300, 0.08),
    ("No", "Yes", "Yes", 450, 0.3),
]
emails = [dict(zip(feature_names, row)) for row in feature_rows]

# Run the classifier on every sample and report the verdict (numbered from 1)
for i, email in enumerate(emails):
    classification = calculate_posterior(email)
    print(f"Email {i+1} is classified as: {classification}")

Email 1 is classified as: Spam
Email 2 is classified as: Spam
Email 3 is classified as: Spam
Email 4 is classified as: Not Spam
Email 5 is classified as: Not Spam
