In [9]:
import pandas as pd
import numpy as np

In [10]:
def count_classes_and_words(filename):
    """Count emails and per-class word frequencies in a labelled email file.

    Every non-blank line is expected to look like ``<label><TAB><words>``,
    where ``<words>`` is a ", "-separated list. A label of "0" marks a ham
    email; any other label is counted as spam.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.

    Returns
    -------
    tuple
        ``(total_ham_emails, total_spam_emails, ham_word_counts,
        spam_word_counts)``. The totals are ints; the word counts are dicts
        mapping each whitespace-stripped word to the number of times it
        occurs in emails of that class.

    Raises
    ------
    ValueError
        If a non-blank line contains no tab separator.
    """
    total_ham_emails = 0
    total_spam_emails = 0

    ham_word_counts = {}
    spam_word_counts = {}

    with open(filename) as f:
        for line in f:
            # Blank lines (for example a trailing empty line at the end of the
            # file) carry no email; skip them instead of failing to unpack.
            if not line.strip():
                continue

            # Split on the first tab only, so a stray tab inside the message
            # stays part of the message instead of breaking the unpacking.
            label, words = line.split("\t", 1)

            if label == "0":
                total_ham_emails += 1
                word_counts = ham_word_counts
            else:
                total_spam_emails += 1
                word_counts = spam_word_counts

            # Same counting logic for both classes; only the target dict differs.
            for word in words.split(", "):
                word = word.strip()
                word_counts[word] = word_counts.get(word, 0) + 1

    return total_ham_emails, total_spam_emails, ham_word_counts, spam_word_counts
        

#df = pd.read_csv("subset.csv", sep="\t")

#print(df.head())

In [11]:
def group1():
    """Print summary statistics for subset.csv.

    Prints the share of spam emails as a rounded percentage, followed by the
    number of occurrences of the word 'free' in ham and then in spam emails.
    """
    n_ham, n_spam, ham_counts, spam_counts = count_classes_and_words("subset.csv")

    spam_fraction = n_spam / (n_spam + n_ham)
    print(round(spam_fraction * 100), "%")

    # Ham count first, then spam count, one per line.
    for per_class_counts in (ham_counts, spam_counts):
        print(per_class_counts['free'])
group1()

13 %
12
43


In [78]:
def to_log_probabilities(total_ham_emails,total_spam_emails,ham_word_counts,spam_word_counts, a):
    """Turn class and word counts into log-probabilities for Naive Bayes.

    Word likelihoods use additive smoothing over the joint vocabulary V
    (every word seen in either class):

        P(word | class) = (count(word, class) + a) / (total_words(class) + a * |V|)

    so that, for a > 0, the probabilities of each class sum to 1 over V.
    When the smoothed numerator would be exactly zero (a == 0 and the word
    never occurs in that class) it is replaced by a small floor (1e-5) so the
    logarithm stays finite.

    Parameters
    ----------
    total_ham_emails, total_spam_emails : int
        Number of ham / spam emails; used for the class priors.
    ham_word_counts, spam_word_counts : dict
        Mapping word -> number of occurrences in that class.
    a : float
        Additive smoothing constant (0 disables smoothing).

    Returns
    -------
    tuple
        ``(log_prior, ham_log_probs, spam_log_probs)`` where ``log_prior`` is
        ``np.array([log P(ham), log P(spam)])`` and the two dicts map every
        vocabulary word to its log-likelihood under that class.
    """
    SMALL = 1e-5

    vocabulary = set(ham_word_counts.keys()) | set(spam_word_counts.keys())

    # Loop-invariant denominators, computed once instead of once per word.
    ham_denominator = sum(ham_word_counts.values()) + a * len(vocabulary)
    spam_denominator = sum(spam_word_counts.values()) + a * len(vocabulary)

    def smoothed_log_prob(count, denominator):
        numerator = count + a
        if numerator == 0:
            # Only floor genuinely-zero numerators; adding the floor on top of
            # `a` would bias the smoothed estimate.
            numerator = SMALL
        return np.log(numerator / denominator)

    ham_log_probs = {}
    spam_log_probs = {}

    for word in vocabulary:
        ham_log_probs[word] = smoothed_log_prob(ham_word_counts.get(word, 0), ham_denominator)
        spam_log_probs[word] = smoothed_log_prob(spam_word_counts.get(word, 0), spam_denominator)

    total_emails = total_spam_emails + total_ham_emails
    log_ham = np.log(total_ham_emails / total_emails)
    log_spam = np.log(total_spam_emails / total_emails)
    log_prior = np.array([log_ham, log_spam])

    return log_prior, ham_log_probs, spam_log_probs

# Module-level counts for subset.csv. These four names become notebook
# globals, so any later cell or function that refers to one of them without
# binding its own local will silently read the values computed here.
total_ham_emails, total_spam_emails,ham_word_counts, spam_word_counts =  count_classes_and_words("subset.csv")
# Disabled call: it omits the smoothing argument `a` that
# to_log_probabilities requires.
#to_log_probabilities(total_ham_emails, total_spam_emails,ham_word_counts, spam_word_counts)

In [79]:
def question_1b():
    """Compute the unsmoothed (a=0) log-probabilities for subset.csv.

    The computed tables are not returned or displayed; the function currently
    only prints an empty line.
    """
    # All four count values are bound as locals here, so the function does not
    # depend on a same-named notebook global being defined by another cell.
    total_ham_emails, total_spam_emails, ham_word_counts, spam_word_counts =  count_classes_and_words("subset.csv")
    log_prior, ham_log_probs, spam_log_probs = to_log_probabilities(total_ham_emails, total_spam_emails,ham_word_counts, spam_word_counts,0)
    print()

In [82]:
def naive_bayes(log_prior, ham_log_probs, spam_log_probs, bag_of_words):
    """Classify a bag of words as ham (0) or spam (1) with Naive Bayes.

    Starting from the class log-priors, the log-likelihood of every word is
    added to each class score, and the index of the larger score is returned.

    Parameters
    ----------
    log_prior : array-like of two floats
        ``[log P(ham), log P(spam)]``. It is copied, never modified.
    ham_log_probs, spam_log_probs : dict
        Mapping word -> log P(word | class).
    bag_of_words : iterable of str
        The words of the message to classify.

    Returns
    -------
    int
        0 if the ham score is at least the spam score, otherwise 1.
    """
    SMALL = 1e-5
    # A word missing from a table is treated as having probability SMALL, so
    # its contribution must be log(SMALL). Adding SMALL itself would add a
    # slightly positive "log-probability" and reward the class lacking the word.
    unseen_log_prob = np.log(SMALL)

    # Float copy: keeps the caller's prior untouched and also accepts lists.
    log_prob = np.array(log_prior, dtype=float)

    # Update the belief of each class with every observed word.
    for word in bag_of_words:
        log_prob[0] += ham_log_probs.get(word, unseen_log_prob)
        log_prob[1] += spam_log_probs.get(word, unseen_log_prob)

    return np.argmax(log_prob)
    
def group_2():
    """Classify one hard-coded message with the unsmoothed subset.csv model.

    The model is built from subset.csv with smoothing 0. The message is
    lower-cased, stripped of '!', '?', '.' and ',' and split on whitespace
    before being classified; the verdict is printed.
    """
    counts = count_classes_and_words("subset.csv")
    log_prior, ham_log_probs, spam_log_probs = to_log_probabilities(*counts, 0)

    text = "Urgent! You have won a free gift."

    # Normalise: lower-case, then drop the punctuation marks one by one.
    text = text.lower()
    for mark in ("!", "?", ".", ","):
        text = text.replace(mark, "")
    tokens = text.split()

    predicted_class = naive_bayes(log_prior, ham_log_probs, spam_log_probs, tokens)
    verdict = "The message is Ham" if predicted_class == 0 else "The message is Spam"
    print(verdict)
group_2()

The message is Spam


In [83]:
# Read the tab-separated training file into a DataFrame.
# NOTE(review): pd.read_csv uses the first row as column names by default,
# whereas count_classes_and_words treats every line as an email — confirm
# whether the file has a header row. `cf` is not referenced again in the
# cells visible here.
cf = pd.read_csv("datasets/train.csv",sep="\t")
#print(cf.head)

In [95]:
def group1_a1():
    """Train on datasets/train.csv (smoothing 0.6) and print training accuracy.

    The model is fitted on the training file and then evaluated on the very
    same file, so the printed figure is the accuracy on data the model has
    already seen. Prints "Accuracy: nan" if the file contains no emails to
    evaluate.
    """
    train_path = "datasets/train.csv"
    total_ham, total_spam, ham_counts, spam_counts = count_classes_and_words(train_path)
    prior_ham_spam, ham_log_probs, spam_log_probs = to_log_probabilities(total_ham, total_spam, ham_counts, spam_counts,0.6)

    total = 0
    correct = 0

    with open(train_path) as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line) instead of crashing.
            if not line.strip():
                continue
            # Only the first tab separates label and message.
            label, message = line.split("\t", 1)
            # Strip every token, exactly as count_classes_and_words does when
            # it builds the word counts, so lookups use identical keys.
            words = [word.strip() for word in message.split(", ")]

            predicted = naive_bayes(prior_ham_spam, ham_log_probs, spam_log_probs, words)
            if predicted == int(label):
                correct += 1
            total += 1

    # Avoid a ZeroDivisionError when there was nothing to evaluate.
    acc = correct / total if total else float("nan")
    print(f"Accuracy: {acc:.4f}")

group1_a1()

Accuracy: 0.9917


In [103]:
def group1_a2a():
    """Train on datasets/train.csv without smoothing and print test accuracy.

    The model is fitted on the training file only (smoothing 0) and evaluated
    on datasets/test.csv. Prints "Accuracy: nan" if the test file contains no
    emails to evaluate.
    """
    total_ham1, total_spam1, ham_counts1, spam_counts1 = count_classes_and_words("datasets/train.csv")
    prior_ham_spam1, ham_log_probs1, spam_log_probs1 = to_log_probabilities(total_ham1, total_spam1, ham_counts1, spam_counts1,0)

    total = 0
    correct = 0

    with open("datasets/test.csv") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line) instead of crashing.
            if not line.strip():
                continue
            # Only the first tab separates label and message.
            label, message = line.split("\t", 1)
            # Strip every token, exactly as count_classes_and_words does when
            # it builds the word counts, so lookups use identical keys.
            words = [word.strip() for word in message.split(", ")]

            predicted = naive_bayes(prior_ham_spam1, ham_log_probs1, spam_log_probs1, words)
            if predicted == int(label):
                correct += 1
            total += 1

    # Avoid a ZeroDivisionError when there was nothing to evaluate.
    acc = correct / total if total else float("nan")
    print(f"Accuracy: {acc:.4f}")

group1_a2a()

Accuracy: 0.9803


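group1_a2b) I would report the second accuracy (0.9803, measured on the test set), because it is computed on data the model did not see during training and therefore estimates how well the model generalizes. The first accuracy (0.9917) is measured on the training data itself and is optimistically biased.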

In [105]:
def group2_a():
    """Train on datasets/train.csv with smoothing 0.2 and print test accuracy.

    The model is fitted on the training file only and evaluated on
    datasets/test.csv. Prints "Accuracy: nan" if the test file contains no
    emails to evaluate.
    """
    total_ham1, total_spam1, ham_counts1, spam_counts1 = count_classes_and_words("datasets/train.csv")
    prior_ham_spam1, ham_log_probs1, spam_log_probs1 = to_log_probabilities(total_ham1, total_spam1, ham_counts1, spam_counts1,0.2)

    total = 0
    correct = 0

    with open("datasets/test.csv") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line) instead of crashing.
            if not line.strip():
                continue
            # Only the first tab separates label and message.
            label, message = line.split("\t", 1)
            # Strip every token, exactly as count_classes_and_words does when
            # it builds the word counts, so lookups use identical keys.
            words = [word.strip() for word in message.split(", ")]

            predicted = naive_bayes(prior_ham_spam1, ham_log_probs1, spam_log_probs1, words)
            if predicted == int(label):
                correct += 1
            total += 1

    # Avoid a ZeroDivisionError when there was nothing to evaluate.
    acc = correct / total if total else float("nan")
    print(f"Accuracy: {acc:.4f}")

group2_a()

Accuracy: 0.9839


In [115]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [117]:
def group3():
    """Fit a scikit-learn bag-of-words Naive Bayes model and print its accuracy.

    Reads ``<label><TAB><message>`` lines from datasets/dataset.csv, holds out
    20% of them as a stratified test split (random_state=1), fits a
    CountVectorizer + MultinomialNB(alpha=1) pipeline on the rest, and prints
    the accuracy on the held-out split.
    """
    X = []
    y = []
    with open("datasets/dataset.csv") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line) instead of crashing.
            if not line.strip():
                continue
            # Only the first tab separates label and message, so a tab inside
            # the message does not break the unpacking.
            label, message = line.split("\t", 1)
            X.append(message)
            y.append(int(label))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

    model = make_pipeline(CountVectorizer(), MultinomialNB(alpha=1))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
group3()

Accuracy: 0.9767
