# In [None]:
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
import math
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix


# Fetch the NLTK resources this script relies on: the review corpus, the
# tokenizer model, the POS tagger model and the stopword list.
for _resource in (
	"movie_reviews",
	"punkt",
	"averaged_perceptron_tagger",
	"stopwords",
):
	nltk.download(_resource)

###############################################
def cross_validation(instances, labels, k, train_pred_func):
	"""Run k-fold cross-validation and print metrics pooled over all folds.

	Item ``i`` belongs to the test split of fold ``i % k`` and to the
	training split of every other fold. Predictions from all folds are
	concatenated and scored once at the end; accuracy, precision, recall,
	F-score and the confusion matrix are printed, nothing is returned.

	Args:
		instances: Sequence of instances.
		labels: Sequence of labels, aligned with ``instances``.
		k: Number of folds; must be at least 1.
		train_pred_func: Callable taking ``(training_instances,
			training_labels, test_instances)`` and returning a list with one
			predicted label per test instance.

	Raises:
		ValueError: If ``k`` is smaller than 1 or if ``instances`` and
			``labels`` differ in length.
	"""
	# Fail early: with k < 1 no fold would run and the metrics would be
	# computed on empty lists; mismatched lengths would otherwise surface
	# as a late IndexError or as silently ignored labels.
	if k < 1:
		raise ValueError("k must be at least 1, got %r" % (k,))
	if len(instances) != len(labels):
		raise ValueError(
			"instances and labels differ in length: %d vs %d"
			% (len(instances), len(labels)))

	golden_labels = []
	pred_labels = []
	for fold in range(k):
		training_instances, training_labels = [], []
		test_instances, test_labels = [], []
		for i, (instance, label) in enumerate(zip(instances, labels)):
			if i % k == fold:
				test_instances.append(instance)
				test_labels.append(label)
			else:
				training_instances.append(instance)
				training_labels.append(label)
		pred_labels += train_pred_func(training_instances, training_labels, test_instances)
		golden_labels += test_labels
	print("Accuracy: %.4f\nPrecision: %.4f\nRecall: %.4f\nF-score: %.4f" % (
		accuracy_score(golden_labels, pred_labels),
		precision_score(golden_labels, pred_labels),
		recall_score(golden_labels, pred_labels),
		f1_score(golden_labels, pred_labels)))
	print(confusion_matrix(golden_labels, pred_labels))

#############################################
def train_then_predict_mynbc(training_instances, training_labels, test_instances):
	"""Fit a fresh MyNBC on the training split and label the test split.

	Returns a list holding, for each test instance in order, the label
	chosen by ``MyNBC.predict``.
	"""
	classifier = MyNBC()
	# train() only gathers the counts; scoring happens inside predict().
	classifier.train(training_instances, training_labels)
	return [classifier.predict(document) for document in test_instances]

###############################################
def build_dataset():
	"""Load the raw text of every file in ``movie_reviews`` with a 0/1 label.

	Returns:
		A pair ``(instances, labels)``: the raw text of each file id, and
		for each one ``1`` when its category is ``'pos'``, otherwise ``0``.
	"""
	instances, labels = [], []
	for category in movie_reviews.categories():
		# The label depends only on the category, so compute it once.
		target = 1 if category == 'pos' else 0
		for fileid in movie_reviews.fileids(category):
			instances.append(movie_reviews.raw(fileid))
			labels.append(target)
	return instances, labels
################################################

class MyNBC:
	"""Naive Bayes text classifier over binary word and bigram features.

	A document is reduced to a set of features: stemmed tokens tagged as
	adjective, adverb, verb or noun, plus every adjacent token pair encoded
	as ``bigram_<a>_<b>``. Training counts, per label, how many documents
	contain each feature. Prediction sums smoothed log-probabilities over
	the whole retained vocabulary: features present in the document
	contribute ``log p``, absent ones contribute a down-weighted
	``log(1 - p)``.
	"""

	# Single-pass equivalent of the per-character replacements applied by
	# clean(): the first group becomes a space, the second group is deleted.
	_PUNCT_TABLE = str.maketrans("@&/|~%*#+", " " * 9, ".,!?\"$:")

	def __init__(self, k=0.025, min_count=5, absent_weight=0.45):
		"""Create an untrained classifier.

		Args:
			k: Additive smoothing constant used by ``smooth_prob``.
			min_count: Features seen in fewer training documents than this
				(summed over labels) are dropped at the end of ``train``.
			absent_weight: Multiplier applied to ``log(1 - p)`` for
				vocabulary features that do not occur in the document.
		"""
		self.k = k
		self.min_count = min_count
		self.absent_weight = absent_weight
		self.feature_table = set()             # retained vocabulary
		self.y_counts = Counter()              # label -> number of documents
		self.x_y_counts = defaultdict(Counter)  # feature -> label -> document count
		self.num_instances = 0
		self.stopwords = set(stopwords.words('english'))
		self.stemmer = PorterStemmer()

	def clean(self, string):
		"""Normalise raw text and return its list of stemmed tokens.

		Steps, in order: replace or delete selected punctuation, lowercase,
		drop stopwords, turn hyphens and apostrophes into spaces, tokenize,
		stem. Stopwords are removed before apostrophes are touched, so
		contractions in the stopword list still match.
		"""
		string = string.translate(self._PUNCT_TABLE).lower()

		# split() without an argument splits on any run of whitespace, so
		# words that follow a newline or tab are compared to the stopword
		# list as bare words rather than with the whitespace still attached.
		nostop = [word for word in string.split() if word not in self.stopwords]

		string = " ".join(nostop).replace("-", " ").replace("'", " ")
		return [self.stemmer.stem(word) for word in word_tokenize(string)]

	def extract_features(self, instance) -> set:
		"""Return the set of unigram and bigram features of one document."""
		clean_wlist = self.clean(instance)

		# Unigrams: keep only adjectives (J), adverbs (R), verbs (V), nouns (N).
		features = {
			w for w, t in nltk.pos_tag(clean_wlist)
			if t[0] in {'J', 'R', 'V', 'N'}
		}
		# Bigrams are built from every adjacent token pair, whatever the tags.
		features.update(
			"bigram_%s_%s" % pair for pair in zip(clean_wlist, clean_wlist[1:]))
		return features

	def train(self, instances, labels):
		"""Accumulate document counts, then prune rare features.

		Each feature is counted at most once per document because
		``extract_features`` returns a set. Prints the size of the retained
		vocabulary when done.
		"""
		for instance, label in zip(instances, labels):
			self.y_counts[label] += 1
			for f in self.extract_features(instance):
				self.x_y_counts[f][label] += 1
				self.feature_table.add(f)

		# Derive the total from y_counts so the label prior is always
		# computed against the documents that were actually counted.
		self.num_instances = sum(self.y_counts.values())

		# Collect first, delete afterwards: the dict must not change size
		# while it is being iterated.
		rare = [
			f for f, counts in self.x_y_counts.items()
			if sum(counts.values()) < self.min_count
		]
		for f in rare:
			del self.x_y_counts[f]
			self.feature_table.discard(f)
		print("Number of features: %d" % len(self.feature_table))

	def smooth_prob(self, word, label):
		"""Return the smoothed fraction of ``label`` documents containing ``word``.

		Computed as ``(count + k) / (label_count + 2 * k)``. Unknown words
		are treated as count 0 and are not added to ``x_y_counts``.
		"""
		# .get avoids the defaultdict inserting an empty Counter on lookup.
		counts = self.x_y_counts.get(word)
		count = counts[label] if counts is not None else 0
		return (count + self.k) / (self.y_counts[label] + 2 * self.k)

	def predict(self, instance):
		"""Return the label with the highest log-score for ``instance``.

		Raises:
			IndexError: If no label has been seen, i.e. ``train`` was never
				called with data.
		"""
		features = self.extract_features(instance)
		y_probs = Counter()

		for y in self.y_counts:
			# log P(y) + sum over the vocabulary of the per-feature term.
			score = math.log(self.y_counts[y] / self.num_instances)
			for word in self.feature_table:
				prob = self.smooth_prob(word, y)
				if word in features:
					score += math.log(prob)
				else:
					# Absence is informative too, but it is down-weighted.
					score += self.absent_weight * math.log(1.0 - prob)
			y_probs[y] = score

		return y_probs.most_common(1)[0][0]


######################################################
# Load the labelled corpus, then print 5-fold cross-validation metrics for MyNBC.
X, Y = build_dataset()
cross_validation(X, Y, k=5, train_pred_func=train_then_predict_mynbc)
#########################################################



