In [1]:
import re
from collections import defaultdict
import math

In [17]:
import os

# Training spam corpus: every regular file in this folder becomes one (text, 1) sample.
# NOTE: machine-specific absolute path; adjust it when running on another machine.
spam_path = "C:\\Users\\Gausar\\OneDrive\\Documents\\2024namar\\AI\\lab6\\spam_data\\train\\spam"

# os.listdir returns names in arbitrary order; sorting keeps runs reproducible.
spam_list = sorted(os.listdir(spam_path))
spam_files = []
for file_name in spam_list:
    file_path = os.path.join(spam_path, file_name)
    if os.path.isfile(file_path):
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            spam_files.append((content, 1))

# Report the sample count instead of dumping every email body into the cell output.
print(len(spam_files))




In [20]:
import os

# Training ham corpus: every regular file in this folder becomes one (text, 0) sample.
# NOTE: machine-specific absolute path; adjust it when running on another machine.
ham_path = "C:\\Users\\Gausar\\OneDrive\\Documents\\2024namar\\AI\\lab6\\spam_data\\train\\ham"

# os.listdir returns names in arbitrary order; sorting keeps runs reproducible.
ham_list = sorted(os.listdir(ham_path))
ham_files = []

for file_name in ham_list:
    file_path = os.path.join(ham_path, file_name)
    if os.path.isfile(file_path):
        # errors='ignore' (as in the spam loaders) keeps a single non-UTF-8 byte
        # from aborting the whole load with UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            ham_files.append((content, 0))

# Report the sample count instead of dumping every email body into the cell output.
print(len(ham_files))




In [19]:
import os

# Dev (held-out) spam corpus: each regular file is stored as a plain text string.
# NOTE: machine-specific absolute path; adjust it when running on another machine.
test_spam_path = "C:\\Users\\Gausar\\OneDrive\\Documents\\2024namar\\AI\\lab6\\spam_data\\dev\\spam"

# os.listdir returns names in arbitrary order; sorting keeps runs reproducible.
test_spam_list = sorted(os.listdir(test_spam_path))
spam_tests = []

for file_name in test_spam_list:
    file_path = os.path.join(test_spam_path, file_name)
    if os.path.isfile(file_path):
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            spam_tests.append(content)

# Report the sample count instead of dumping every email body into the cell output.
print(len(spam_tests))




In [44]:
import os

# Dev (held-out) ham corpus: each regular file is stored as a plain text string.
# NOTE: machine-specific absolute path; adjust it when running on another machine.
test_ham_path = "C:\\Users\\Gausar\\OneDrive\\Documents\\2024namar\\AI\\lab6\\spam_data\\dev\\ham"

# os.listdir returns names in arbitrary order; sorting keeps runs reproducible.
test_ham_list = sorted(os.listdir(test_ham_path))
ham_tests = []
for file_name in test_ham_list:
    file_path = os.path.join(test_ham_path, file_name)
    if os.path.isfile(file_path):
        # errors='ignore' (as in the spam loaders) keeps a single non-UTF-8 byte
        # from aborting the whole load with UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            ham_tests.append(content)

# Report the sample count instead of dumping every email body into the cell output.
print(len(ham_tests))





In [45]:
# Sanity check: number of ham dev emails that were loaded.
len(ham_tests)

500

In [6]:
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first; every run of word characters (letters,
    digits and underscore) is then returned as one token, in order of
    appearance. Punctuation and whitespace act only as separators.
    """
    return re.findall(r'\b\w+\b', text.lower())


In [7]:
def build_vocabulary(emails):
    """Collect the vocabulary and per-label token counts from labelled emails.

    Args:
        emails: iterable of ``(text, label)`` pairs. ``label`` is used as an
            index into a two-element count list, so it must be 0 or 1
            (the notebook uses 0 for ham and 1 for spam).

    Returns:
        ``(vocabulary, word_counts)`` where ``vocabulary`` is the set of all
        tokens seen and ``word_counts`` is a ``defaultdict`` mapping each
        token to ``[count_for_label_0, count_for_label_1]``.
    """
    word_counts = defaultdict(lambda: [0, 0])

    for text, label in emails:
        for token in tokenize(text):
            word_counts[token][label] += 1

    # Every token seen above has an entry, so the keys are exactly the vocabulary.
    vocabulary = set(word_counts)
    return vocabulary, word_counts


In [21]:
# Step 1: prepare the data.
# Spam samples carry label 1 and ham samples label 0.
training_data = spam_files + ham_files

total_emails = len(training_data)
spam_emails = sum(label == 1 for _, label in training_data)
ham_emails = total_emails - spam_emails

vocabulary, word_counts = build_vocabulary(training_data)

In [22]:
# Number of distinct tokens found in the training emails.
len(vocabulary)

27866

In [23]:
# token -> [ham_count, spam_count] (list index = label: 0 ham, 1 spam).
# NOTE(review): this displays the whole mapping, one entry per vocabulary token.
word_counts

defaultdict(<function __main__.build_vocabulary.<locals>.<lambda>()>,
            {'subject': [1761, 1123],
             'you': [1243, 1873],
             'are': [528, 840],
             'not': [407, 783],
             'the': [4410, 5167],
             'only': [87, 310],
             'one': [113, 303],
             'who': [80, 104],
             'suffers': [0, 3],
             'from': [713, 686],
             'this': [1094, 1729],
             'pain': [0, 62],
             'striktuur': [0, 1],
             'delibertly': [0, 1],
             'cnusc': [0, 1],
             '_': [873, 667],
             'receivd': [0, 1],
             'd': [369, 240],
             'forsaken': [0, 1],
             'ph': [2, 15],
             'arm': [0, 14],
             'eur': [0, 1],
             'opean': [0, 1],
             '0': [221, 404],
             'vern': [1, 1],
             'i': [1142, 756],
             'ght': [0, 1],
             'shippi': [0, 1],
             'ng': [1, 17],
             'http'

In [24]:
# Number of training emails (spam + ham).
total_emails

2000

In [25]:
def calculate_word_probabilities(word_counts, total_spam_words, total_ham_words, vocab_size, alpha):
    """Estimate smoothed per-class likelihoods for every counted word.

    Args:
        word_counts: mapping word -> ``(ham_count, spam_count)``.
        total_spam_words: total number of word occurrences in spam emails.
        total_ham_words: total number of word occurrences in ham emails.
        vocab_size: number of distinct words; scales the smoothing term.
        alpha: Laplace smoothing coefficient added to every count.

    Returns:
        dict mapping word -> ``(P(word | spam), P(word | ham))``; note the
        spam probability comes first although the counts store ham first.
    """
    return {
        word: (
            (spam_count + alpha) / (total_spam_words + alpha * vocab_size),
            (ham_count + alpha) / (total_ham_words + alpha * vocab_size),
        )
        for word, (ham_count, spam_count) in word_counts.items()
    }

In [26]:
def calculate_prior_probabilities(spam_emails, ham_emails, total_emails):
    """Return the class priors ``(P(spam), P(ham))``.

    Each prior is the class's email count divided by ``total_emails``.
    Raises ZeroDivisionError when ``total_emails`` is 0.
    """
    return spam_emails / total_emails, ham_emails / total_emails

In [27]:
# Total token occurrences per class, summed over the vocabulary
# (index 0 of each count pair is ham, index 1 is spam).
total_ham_words, total_spam_words = [
    sum(word_counts[word][label] for word in vocabulary) for label in (0, 1)
]


In [28]:
# Total number of token occurrences across the spam training emails.
total_spam_words

179765

In [29]:
# Total number of token occurrences across the ham training emails.
total_ham_words

143323

In [30]:

def classify(email, word_probs, prob_spam, prob_ham, vocab_size, alpha,
             spam_word_total=None, ham_word_total=None):
    """Label an email "spam" or "ham" with naive Bayes scores in log space.

    Args:
        email: raw email text; it is split into tokens with ``tokenize``.
        word_probs: mapping word -> ``(P(word | spam), P(word | ham))``.
        prob_spam: prior probability of spam.
        prob_ham: prior probability of ham.
        vocab_size: vocabulary size used to smooth words missing from ``word_probs``.
        alpha: Laplace smoothing coefficient.
        spam_word_total: total word occurrences in the spam training emails.
            Defaults to the notebook-level ``total_spam_words``.
        ham_word_total: total word occurrences in the ham training emails.
            Defaults to the notebook-level ``total_ham_words``.

    Returns:
        "spam" when the spam score is strictly greater, otherwise "ham"
        (ties go to "ham").
    """
    def log_or_neg_inf(probability):
        # math.log raises ValueError for 0; an impossible event scores -inf instead.
        return math.log(probability) if probability > 0 else float("-inf")

    def unseen_log_prob(class_total):
        # Smoothed log-likelihood of a word that has no entry in word_probs.
        denominator = class_total + alpha * vocab_size
        if alpha <= 0 or denominator <= 0:
            return float("-inf")
        return math.log(alpha / denominator)

    # Fall back to the notebook globals so existing six-argument calls keep working.
    if spam_word_total is None:
        spam_word_total = total_spam_words
    if ham_word_total is None:
        ham_word_total = total_ham_words

    # These terms are identical for every unseen word, so compute them once.
    unseen_log_spam = unseen_log_prob(spam_word_total)
    unseen_log_ham = unseen_log_prob(ham_word_total)

    log_spam_prob = log_or_neg_inf(prob_spam)
    log_ham_prob = log_or_neg_inf(prob_ham)

    for word in tokenize(email):
        if word in word_probs:
            p_word_spam, p_word_ham = word_probs[word]
            log_spam_prob += log_or_neg_inf(p_word_spam)
            log_ham_prob += log_or_neg_inf(p_word_ham)
        else:
            log_spam_prob += unseen_log_spam
            log_ham_prob += unseen_log_ham

    return "spam" if log_spam_prob > log_ham_prob else "ham"

In [34]:
# Laplace smoothing coefficient and the vocabulary size it is scaled by.
alpha = 1
vocab_size = len(vocabulary)

# Step 2: compute the probabilities (class priors and per-word likelihoods).
prob_spam, prob_ham = calculate_prior_probabilities(
    spam_emails=spam_emails,
    ham_emails=ham_emails,
    total_emails=total_emails,
)
word_probs = calculate_word_probabilities(
    word_counts=word_counts,
    total_spam_words=total_spam_words,
    total_ham_words=total_ham_words,
    vocab_size=vocab_size,
    alpha=alpha,
)


In [35]:
# token -> (P(token | spam), P(token | ham)) after Laplace smoothing.
# NOTE(review): this displays the whole mapping, one entry per vocabulary token.
word_probs

{'subject': (0.005413449822040062, 0.01029271740590809),
 'you': (0.009025627194397754, 0.007266822050482216),
 'are': (0.004050454893537093, 0.0030901518205024856),
 'not': (0.003775929413237908, 0.002383330704659762),
 'the': (0.02489031021379274, 0.025766842495721105),
 'only': (0.0014978495504043231, 0.0005140517206128899),
 'one': (0.0014641358949289845, 0.0006659306380666982),
 'who': (0.0005057048321300769, 0.00047316124283686454),
 'suffers': (1.926494598590769e-05, 5.841496825146475e-06),
 'from': (0.003308754473079646, 0.004170828733154584),
 'this': (0.008332089138905078, 0.006396439023535391),
 'pain': (0.00030342289927804615, 5.841496825146475e-06),
 'striktuur': (9.632472992953845e-06, 5.841496825146475e-06),
 'delibertly': (9.632472992953845e-06, 5.841496825146475e-06),
 'cnusc': (9.632472992953845e-06, 5.841496825146475e-06),
 '_': (0.0032172459796465844, 0.00510546822517802),
 'receivd': (9.632472992953845e-06, 5.841496825146475e-06),
 'd': (0.0011607129956509384, 0.00

In [36]:
# Prior probability of spam: share of training emails labelled 1.
prob_spam

0.5

In [37]:
# Prior probability of ham: share of training emails labelled 0.
prob_ham

0.5

In [38]:
# Step 3: classify new emails.
# Every email here comes from the dev spam folder, so "spam" is the correct answer.
test_emails = spam_tests

print("Spam ангилалын үр дүн :")
t = 0  # number of emails classified as spam
for email in test_emails:
    result = classify(email, word_probs, prob_spam, prob_ham, vocab_size, alpha)
    t += int(result == "spam")
    print(f"Email: '{email}' => {result}")

# Report a perfect score with a fixed message, otherwise the percentage that was correct.
if t != len(spam_tests):
    print(f"{t / len(spam_tests)*100}-tai zov taamaglaj bna")
else:
    print("100%  zov spam taamaglasan baina")

Spam ангилалын үр дүн :
Email: 'Subject: dobmeos with hgh my energy level has gone up ! stukm
introducing
doctor - formulated
hgh
human growth hormone - also called hgh
is referred to in medical science as the master hormone . it is very plentiful
when we are young , but near the age of twenty - one our bodies begin to produce
less of it . by the time we are forty nearly everyone is deficient in hgh ,
and at eighty our production has normally diminished at least 90 - 95 % .
advantages of hgh :
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swings
- new hair growth and color restore
read
m

In [46]:
# Every email here comes from the dev ham folder, so "ham" is the correct answer.
test_emails = ham_tests

print("Ham ангилалын үр дүн :")
t = 0  # number of emails classified as ham
for email in test_emails:
    result = classify(email, word_probs, prob_spam, prob_ham, vocab_size, alpha)
    t += int(result == "ham")
    print(f"Email: '{email}' => {result}")

# Report a perfect score with a fixed message, otherwise the percentage that was correct.
if t != len(ham_tests):
    print(f"{t / len(ham_tests) *100}-tai zov taamaglaj bna")
else:
    print("100%  zov ham taamaglasan baina")

Ham ангилалын үр дүн :
Email: 'Subject: new arrival ? ?
hi out there ,
welllllllll ! ! !
what ' s the word ?
assume baby is here and you are all kicked back with nothing going on
( chuckle ) .
when you have time give me a call ( again chuckle )
ken
- attl . htm' => spam
Email: 'Subject: re : new turn - ons
the deals are all in . meter 9827 is on deal 223423 , and meter 9828 is on
deal 223458 .' => ham
Email: 'Subject: new turn - ons
tom ,
production commenced to flow as stated below ; please create a base load
ticket in sitara based upon the following information :
counterparty meter no . volumes price period
union gas corp . 9827 3 , 000 mmbtu / d 100 % gas daily less $ 0 . 10 3 / 9 - 3 / 31
cico oil & gas co . 9828 1 , 200 mmbtu / d 100 % gas daily less $ 0 . 095 3 / 15 - 3 / 31
thanks ,
vlt
x 3 - 6353' => ham
Email: 'Subject: 23 rd noms
- - - - - - - - - - - - - - - - - - - - - - forwarded by ami chokshi / corp / enron on 03 / 22 / 2000
03 : 42 pm - - - - - - - - - - - - - - - - - -

In [None]:
def compute_confusion_matrix(test_emails, prob_spam, prob_non_spam, word_prob_spam, word_prob_non_spam,
                             vocab_size=None, alpha=1):
    """Count TP/FP/TN/FN for ``classify`` on labelled emails (spam = positive).

    Args:
        test_emails: iterable of ``(text, actual_label)`` pairs with 1 = spam
            and 0 = ham. Pairs carrying any other label are not counted.
        prob_spam: prior probability of spam.
        prob_non_spam: prior probability of ham.
        word_prob_spam: mapping word -> P(word | spam).
        word_prob_non_spam: mapping word -> P(word | ham).
        vocab_size: vocabulary size forwarded to ``classify`` for words it has
            no probability for. Defaults to the number of words present in
            both probability mappings.
        alpha: smoothing coefficient forwarded to ``classify``.

    Returns:
        ``(TP, FP, TN, FN)``.
    """
    # classify expects a single mapping word -> (P(word | spam), P(word | ham)).
    word_probs = {
        word: (p_spam, word_prob_non_spam[word])
        for word, p_spam in word_prob_spam.items()
        if word in word_prob_non_spam
    }
    if vocab_size is None:
        vocab_size = len(word_probs)

    TP = FP = TN = FN = 0

    for email, actual_label in test_emails:
        # classify returns the strings "spam"/"ham"; convert to the 1/0 labels
        # used by the data so the comparisons below can actually match.
        verdict = classify(email, word_probs, prob_spam, prob_non_spam, vocab_size, alpha)
        predicted_label = 1 if verdict == "spam" else 0

        if predicted_label == 1 and actual_label == 1:
            TP += 1
        elif predicted_label == 1 and actual_label == 0:
            FP += 1
        elif predicted_label == 0 and actual_label == 0:
            TN += 1
        elif predicted_label == 0 and actual_label == 1:
            FN += 1

    return TP, FP, TN, FN


# Evaluate on the labelled dev set built from the emails loaded earlier
# (spam -> 1, ham -> 0), reusing the model trained in the cells above.
test_emails = [(email, 1) for email in spam_tests] + [(email, 0) for email in ham_tests]

# Split the combined probability table into the per-class mappings the function takes.
word_prob_spam = {word: probs[0] for word, probs in word_probs.items()}
word_prob_non_spam = {word: probs[1] for word, probs in word_probs.items()}

TP, FP, TN, FN = compute_confusion_matrix(
    test_emails, prob_spam, prob_ham, word_prob_spam, word_prob_non_spam,
    vocab_size=vocab_size, alpha=alpha,
)

# Print confusion matrix
print("Confusion Matrix:")
print(f"True Positive (TP): {TP}")
print(f"False Positive (FP): {FP}")
print(f"True Negative (TN): {TN}")
print(f"False Negative (FN): {FN}")
