# COMPSCI589 Homework 2
##### Chang Liu, 3.6.2022

## Programming: Multinomial Naive Bayes for Document Classification

In [1]:
# utils.py

import re
import os
import glob
import random
from nltk.corpus import stopwords
import nltk

REPLACE_NO_SPACE = re.compile("[._;:!`¦\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
nltk.download('stopwords')  

def preprocess_text(text):
	stop_words = set(stopwords.words('english'))
	text = REPLACE_NO_SPACE.sub("", text)
	text = REPLACE_WITH_SPACE.sub(" ", text)
	text = re.sub(r'\d+', '', text)
	text = text.lower()
	words = text.split()
	return [w for w in words if w not in stop_words]

def load_training_set(percentage_positives, percentage_negatives):
	vocab = set()
	positive_instances = []
	negative_instances = []
	for filename in glob.glob('train/pos/*.txt'):
		if random.random() > percentage_positives:
			continue
		with open(os.path.join(os.getcwd(), filename), 'r') as f:
			contents = f.read()
			contents = preprocess_text(contents)
			positive_instances.append(contents)
			vocab = vocab.union(set(contents))
	for filename in glob.glob('train/neg/*.txt'):
		if random.random() > percentage_negatives:
			continue
		with open(os.path.join(os.getcwd(), filename), 'r') as f:
			contents = f.read()
			contents = preprocess_text(contents)
			negative_instances.append(contents)
			vocab = vocab.union(set(contents))	
	return positive_instances, negative_instances, vocab

def load_test_set(percentage_positives, percentage_negatives):
	positive_instances = []
	negative_instances = []
	for filename in glob.glob('test/pos/*.txt'):
		if random.random() > percentage_positives:
			continue
		with open(os.path.join(os.getcwd(), filename), 'r') as f:
			contents = f.read()
			contents = preprocess_text(contents)
			positive_instances.append(contents)
	for filename in glob.glob('test/neg/*.txt'):
		if random.random() > percentage_negatives:
			continue
		with open(os.path.join(os.getcwd(), filename), 'r') as f:
			contents = f.read()
			contents = preprocess_text(contents)
			negative_instances.append(contents)
	return positive_instances, negative_instances
	

[nltk_data] Downloading package stopwords to /Users/von/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# run.py
# from utils import *
import pprint
from collections import Counter
import math

def naive_bayes(smooth=True, log_likelihood=True):
	
	debug_v = 0.1 #0.0004
	percentage_positive_instances_train = debug_v
	percentage_negative_instances_train = debug_v

	percentage_positive_instances_test  = debug_v
	percentage_negative_instances_test  = debug_v
	
	(pos_train, neg_train, vocab) = load_training_set(percentage_positive_instances_train, percentage_negative_instances_train)
	(pos_test,  neg_test)         = load_test_set(percentage_positive_instances_test, percentage_negative_instances_test)

	print("Number of positive training instances:", len(pos_train))
	print("Number of negative training instances:", len(neg_train))
	print("Number of positive test instances:", len(pos_test))
	print("Number of negative test instances:", len(neg_test))

	with open('vocab.txt','w') as f:
		for word in vocab:
			f.write("%s\n" % word)
	print("Vocabulary (training set):", len(vocab))

	vocab_size = len(vocab)
	# Calculate the prior probabilities
	prior_pos = len(pos_train) / (len(pos_train) + len(neg_train))
	prior_neg = len(neg_train) / (len(pos_train) + len(neg_train))

	print("Prior probability of positive class:", prior_pos)
	print("Prior probability of negative class:", prior_neg)

	# Build the likelihoods table
	train_dict = {}
	for word in vocab:
		train_dict[word] = 0;
	
	likelihoods = {}
	likelihoods["pos"] = train_dict.copy()
	likelihoods["pos"].update( dict(Counter(sum(pos_train, []))) )
	likelihoods["neg"] = train_dict.copy()
	likelihoods["neg"].update( dict(Counter(sum(neg_train, []))) )

	word_count_pos = sum(likelihoods["pos"].values()) 
	word_count_neg = sum(likelihoods["neg"].values())

	model_pos = {}
	model_neg = {}

	# calculate probablity, apply lapalce smoothing 
	for word in likelihoods["pos"]:
		if smooth:
			model_pos[word] = (likelihoods["pos"][word] + 1) / (word_count_pos + vocab_size) 
		else:
			model_pos[word] = likelihoods["pos"][word] / word_count_pos

	for word in likelihoods["neg"]:
		if smooth:
			model_neg[word] = (likelihoods["neg"][word] + 1) / (word_count_neg + vocab_size)
		else:
			model_neg[word] = likelihoods["neg"][word] / word_count_neg

	pos_test_correct = 0
	for doc in pos_test:
		doc_dict = dict(Counter(doc))
		doc_p_pos = math.log(prior_pos) if log_likelihood else prior_pos
		doc_p_neg = math.log(prior_neg) if log_likelihood else prior_neg
		for word in doc_dict:
			if word in model_pos: 
				# it should also exist in the negative vacabulary
				if log_likelihood:
					doc_p_pos += math.log(model_pos[word]) if model_pos[word] != 0 else 0
					doc_p_neg += math.log(model_neg[word]) if model_neg[word] != 0 else 0
				else: 
					doc_p_pos *= model_pos[word]
					doc_p_neg *= model_neg[word]
		if (doc_p_pos > doc_p_neg):
			pos_test_correct += 1

	neg_test_correct = 0
	for doc in neg_test:
		doc_dict = dict(Counter(doc))
		doc_p_pos = math.log(prior_pos) if log_likelihood else prior_pos
		doc_p_neg = math.log(prior_neg) if log_likelihood else prior_neg
		for word in doc_dict:
			if word in model_pos:
				# it should also exist in the negative vacabulary
				if log_likelihood:
					doc_p_pos += math.log(model_pos[word]) if model_pos[word] != 0 else 0
					doc_p_neg += math.log(model_neg[word]) if model_neg[word] != 0 else 0
				else:
					doc_p_pos *= model_pos[word]
					doc_p_neg *= model_neg[word]
		if (doc_p_pos < doc_p_neg):
			neg_test_correct += 1

	print("correct Pos Test: ", pos_test_correct);
	print("correct Neg Test: ", neg_test_correct);
	
	accuracy = (pos_test_correct + neg_test_correct) / (len(pos_test) + len(neg_test))
	precision = pos_test_correct / (pos_test_correct + neg_test_correct)
	print(accuracy)
	return accuracy, precision


naive_bayes(smooth=True, log_likelihood=True)



Number of positive training instances: 1274
Number of negative training instances: 1228
Number of positive test instances: 1270
Number of negative test instances: 1252
Vocabulary (training set): 29714
Prior probability of positive class: 0.5091926458832934
Prior probability of negative class: 0.4908073541167066
correct Pos Test:  1012
correct Neg Test:  1076
0.8279143536875495


(0.8279143536875495, 0.4846743295019157)