<a href="https://colab.research.google.com/github/KhueNguyen312/Naive-Bayes-Classifier-for-Text-Classification/blob/main/Multinomial_Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import re
import numpy as np

In [None]:
# Fetch the NLTK resources this notebook relies on: the movie-review corpus,
# the English stop-word list, and the WordNet data used for lemmatisation.
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import movie_reviews
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance reused by every preprocessing call.
wordnet_lemmatizer = WordNetLemmatizer()
# Kept as a frozenset rather than the list NLTK returns: the only use is the
# `word not in cachedStopWords` membership test, which runs once per token of
# the corpus and would otherwise scan the whole list each time.
cachedStopWords = frozenset(stopwords.words("english"))

In [None]:
def preprocess_string(text):
  """Normalise a review into a space-separated string of lemmatised tokens.

  Args:
    text: Either the review as a string, or a mapping that holds the review
      under the 'review' key (the record format built by load_movie_reviews).

  Returns:
    The lower-cased text with every run of non-letter characters removed,
    stop words dropped and each remaining word lemmatised; the surviving
    tokens are joined by single spaces ('' when nothing survives).
  """
  if not isinstance(text,str): text = text['review']

  # Lower-case first: NLTK's English stop words are lower-case, so filtering
  # before lower-casing would let capitalised stop words ("The") through.
  p_text = text.lower()
  # Replace anything that is not a letter or whitespace with a space.
  p_text = re.sub(r'[^a-z\s]+',' ',p_text)
  # split() with no argument already collapses runs of whitespace.
  words = [wordnet_lemmatizer.lemmatize(word) for word in p_text.split() if word not in cachedStopWords]
  # Return the cleaned tokens (the previous version returned text.lower(),
  # silently discarding all of the cleaning above).
  return ' '.join(words)

In [None]:
def load_movie_reviews():
  """Load the NLTK movie_reviews corpus as a list of cleaned, labelled records.

  Returns:
    A list with one dict per corpus file. 'review' holds the cleaned text
    (lower-cased, non-letter characters removed, stop words dropped, words
    lemmatised, tokens joined by single spaces) and 'tag' holds the category
    the file was listed under.
  """
  raw_data = []
  for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
      # Lower-case before the stop-word filter so capitalised stop words
      # cannot slip through; a no-op for text that is already lower-case.
      text = movie_reviews.raw(fileid).lower()
      # Replace anything that is not a letter or whitespace with a space.
      text = re.sub(r'[^a-z\s]+',' ',text)
      # split() with no argument already collapses runs of whitespace.
      words = [wordnet_lemmatizer.lemmatize(word) for word in text.split() if word not in cachedStopWords]

      raw_data.append({
          'review': ' '.join(words),
          'tag': category
      })
  return raw_data

In [None]:
class NaiveBayes:
  """Multinomial Naive Bayes text classifier with add-one smoothing.

  Training records are dicts with a 'review' string and a 'tag' label. After
  train() the instance exposes:
    classes       -- array of class labels, in the order used for all indices
    prob_classes  -- prior probability of each class
    bow_dict      -- one {word: count} dict per class
    vocab         -- sorted array of every distinct training word
    vocab_lenght  -- size of vocab (attribute name kept as-is for callers)
    denoms        -- per-class smoothing denominator count(c) + |V| + 1
  """

  def __init__(self,u_classes):
    """Store the class labels.

    Args:
      u_classes: The distinct class labels, as a numpy array or any sequence.
    """
    # asarray lets plain lists/tuples work too; the rest of the class relies
    # on `.shape`.
    self.classes = np.asarray(u_classes)

  @staticmethod
  def _tokenize(text):
    """Split text on whitespace and hyphens, dropping empty tokens."""
    return [token for token in re.split(r"\s+|-", text) if token]

  def count_frequencies(self,data):
    """Count word occurrences per class.

    Args:
      data: Iterable of records with 'review' and 'tag' entries.

    Returns:
      A list with one {word: count} dict per class, ordered like
      self.classes. Records whose tag is not a known class are ignored.
    """
    freq = [dict() for _ in range(self.classes.shape[0])]
    class_index = {cat: index for index,cat in enumerate(self.classes.tolist())}

    for item in data:
      index = class_index.get(item['tag'])
      if index is None:
        continue
      counts = freq[index]
      # Empty tokens (from leading/trailing whitespace or doubled hyphens)
      # are not words and must not inflate the class word count.
      for w in self._tokenize(item['review']):
        counts[w] = counts.get(w, 0) + 1

    return freq

  def train(self,data,labels):
    """Estimate class priors and per-class word counts.

    Args:
      data: Sequence of records with 'review' and 'tag' entries.
      labels: Sequence of class labels used to compute the priors.

    Raises:
      ValueError: If labels is empty (the priors would be undefined).
    """
    self.data = data
    self.labels = labels

    #convert to numpy array.
    if not isinstance(self.data,np.ndarray): self.data = np.array(self.data)
    if not isinstance(self.labels,np.ndarray): self.labels = np.array(self.labels)

    if self.labels.size == 0:
      raise ValueError("cannot train on an empty label set")

    n_classes = self.classes.shape[0]
    self.prob_classes = np.empty(n_classes)
    self.bow_dict = self.count_frequencies(data)
    category_word_count = np.empty(n_classes)
    vocab = set()
    for index,category in enumerate(self.classes):
      #prior probability of the class p(c)
      self.prob_classes[index] = np.sum(self.labels == category)/float(self.labels.shape[0])

      #total number of word occurrences seen for the class: count(c)
      category_word_count[index] = sum(self.bow_dict[index].values())

      vocab.update(self.bow_dict[index])

    self.vocab = np.unique(list(vocab))
    self.vocab_lenght = self.vocab.shape[0]

    # count(c) + |V| + 1, the denominator documented in get_pos_prob. The
    # extra 1 was previously added twice (once to the word count and once
    # here), giving count(c) + |V| + 2.
    self.denoms = category_word_count + self.vocab_lenght + 1

  def get_pos_prob(self,sample):
    """Score a preprocessed sample against every class.

    Args:
      sample: Preprocessed text; tokenised the same way as the training data.

    Returns:
      Array with one unnormalised log posterior per class:
      log p(c) + sum over words w of log([count(w|c)+1] / [count(c)+|V|+1]).
    """
    tokens = self._tokenize(sample)
    posterior_prob = np.empty(self.classes.shape[0])

    for index in range(self.classes.shape[0]):
      counts = self.bow_dict[index]
      denom = float(self.denoms[index])

      # Sum logs instead of multiplying probabilities to avoid underflow.
      log_likelihood = 0.0
      for w in tokens:
        #unseen words fall back to a count of 0 and get only the +1
        log_likelihood += np.log((counts.get(w,0) + 1)/denom)

      posterior_prob[index] = log_likelihood + np.log(self.prob_classes[index])

    return posterior_prob


  def predict(self,test_set):
    """Predict a class label for every sample.

    Args:
      test_set: Iterable of samples accepted by preprocess_string (review
        strings or records with a 'review' entry).

    Returns:
      numpy array with the highest-scoring class label for each sample.
    """
    predictions = []

    for sample in test_set:
      p_sample = preprocess_string(sample)

      #score the sample against all classes and keep the best one
      pos_prob = self.get_pos_prob(p_sample)

      predictions.append(self.classes[np.argmax(pos_prob)])
    return np.array(predictions)

    

In [None]:
# Fixed seed so the shuffle -- and therefore the train/test split and every
# accuracy figure computed from it -- is the same on each Restart & Run All.
SEED = 42
# Number of shuffled reviews used for training; the remainder is the test set.
TRAIN_SIZE = 1500

dataset = np.array(load_movie_reviews())
np.random.default_rng(SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

train_labels = [item['tag'] for item in train_data]
test_labels = [item['tag'] for item in test_data]

# Peek at a few training labels to confirm the classes are mixed.
train_labels[1:10]

['neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg']

In [None]:
# Build the classifier over the distinct labels found in the training split,
# then fit it on the training records.
nb = NaiveBayes(u_classes=np.unique(train_labels))
print("---------------Trainning Model------------------")
nb.train(data=train_data, labels=train_labels)
print("---------------Trainning Completed------------------")

---------------Trainning Model------------------
---------------Trainning Completed------------------


In [None]:
print("---------------Test Model------------------")
# Predict a label for every held-out review.
p_test = nb.predict(test_data)

# Accuracy is the fraction of predictions that equal the true label.
test_acc = np.mean(p_test == np.asarray(test_labels))

print("Test Set Examples: ", len(test_labels))
print("Test Set Accuracy: ", test_acc*100, "%")

---------------Test Model------------------
Test Set Examples:  500
Test Set Accuracy:  93.60000000000001 %


In [None]:
# NOTE(review): this cell repeats the evaluation cell above verbatim; the
# differing recorded outputs suggest the two were run against different
# shuffles -- consider keeping only one.
print("---------------Test Model------------------")
# Predict a label for every held-out review.
p_test = nb.predict(test_data)

# Accuracy is the fraction of predictions that equal the true label.
test_acc = np.mean(p_test == np.asarray(test_labels))

print("Test Set Examples: ", len(test_labels))
print("Test Set Accuracy: ", test_acc*100, "%")

---------------Test Model------------------
Test Set Examples:  500
Test Set Accuracy:  84.39999999999999 %


In [None]:
# Classify a single free-text sentence with the trained model.
nb.predict(['it worthy to watch'])

array(['pos'], dtype='<U3')

In [None]:
# Classify a one-word input with the trained model.
nb.predict(['yeah'])

array(['neg'], dtype='<U3')