In [36]:
# Mohammed Ahmed Zakiuddin
# 1001675091

import numpy as np
import pandas as pd
import scipy.stats as stats

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
class NaiveBayesClassifier():
  '''
  Bayes Theorem Formula
  P(Y|X) = P(X|Y) * P(Y) / P(X)
  '''

  def calc_prior(self, features, target):
    '''
    prior probability P(y)
    '''

    self.prior = (features.groupby(target).apply(lambda x: len(x))/self.rows).to_numpy()
    return self.prior
  
  def calc_stats(self, features, target):
    '''
    calc mean, variance  for each column
    '''
    self.mean = features.groupby(target).apply(np.mean).to_numpy()
    self.var = features.groupby(target).apply(np.var).to_numpy()

    self.mean += 0.01
    self.var += 0.01

    return self.mean, self.var

  def calc_posterior(self, x):

    # Calculating the posterior probability for each class

    posterior = []
    for i in range(self.count):
      prior = np.log(self.prior[i])
      conditional = np.sum(np.log(self.gaussian_den(i, x)))
      post = prior + conditional
      posterior.append(post)
    
    return self.class_target[np.argmax(posterior)]
  
  def gaussian_den(self, class_idx, x):

    # Calculate the probability from the gaussian density function (normal dis)

    mean = self.mean[class_idx]
    var = self.var[class_idx]
    numerator = np.exp((-1/2)*((x-mean)**2)/(2*var))
    den = np.sqrt(2* np.pi *var)
    prob = numerator/den
    return prob
    
  def fit(self, features, target):

    self.class_target = np.unique(target)
    self.count = len(self.class_target)  
    self.feature_num = features.shape[1]
    self.rows = features.shape[0] 
    self.calc_stats(features, target)
    self.calc_prior(features, target)

  def predict(self, features):
    predictions = [self.calc_posterior(f) for f in features.to_numpy()]
    return predictions
  
  def accuracy(self, y_test, y_pred):
    accuracy = np.sum(y_test == y_pred)/len(y_test)
    return accuracy 
  

In [38]:
def calc_prior(features, target):

    '''
    prior probability P(y)
    '''
    rows = features.shape[0]
    prior = (features.groupby(target).apply(lambda x: len(x))/rows).to_numpy()
    return prior

features = []
target = []

with open("customerReviews.txt") as file_input:
    for line in file_input.readlines():
        t = line.split(",")[-1].split()[0]
        f = ' '.join(line.split(",")[:-1])
        features.append(f)
        target.append(t)

df = pd.DataFrame(list(zip(features, target)), columns=['Reviews', 'Target'])

Review, Label = df.iloc[:, 0], df.iloc[:,-1]

check = CountVectorizer()
Review = check.fit_transform(Review)

Review = pd.DataFrame(Review.toarray(), columns = check.get_feature_names_out())

X_train, X_test, y_train, y_test = train_test_split(Review, Label, test_size = 0.3, random_state = 0)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Convert the DataFrame to a list of tuples this is a dictionary
data = list(df.to_records(index=False))

# Separated the sentences and labels into separate lists
sentences = [d[0] for d in data]
labels = [d[1] for d in data]

# Split the sentences into words
words = [sentence.split() for sentence in sentences]

pos_word_freq = {}
neg_word_freq = {}

# First checks whether the label of the review is positive or negative.
# Then, loops through each word in the review and update the corresponding dictionary with the frequency count for that word. 
# If the word is not already in the dictionary, you add it with a count of 1.

for i, sentence in enumerate(sentences):
  if labels[i] == 'positive':
    # process positive reviews
    for word in words[i]:
      if word in pos_word_freq:
        pos_word_freq[word] += 1
      else:
        pos_word_freq[word] = 1
  elif labels[i] == 'negative':
    #process negative reviews
    for word in words[i]:
      if word in neg_word_freq:
        neg_word_freq[word] += 1
      else:
        neg_word_freq[word] = 1

print('Top 10 most frequent words in positive reviews:')
sorted_pos_word_freq = sorted(pos_word_freq.items(), key=lambda x: x[1], reverse=True)
for word, freq in sorted_pos_word_freq[:10]:
    print(f'{word}: {freq}')

print('Top 10 most frequent words in negative reviews:')
sorted_neg_word_freq = sorted(neg_word_freq.items(), key=lambda x: x[1], reverse=True)
for word, freq in sorted_neg_word_freq[:10]:
    print(f'{word}: {freq}')


# Calculate the probability of each word given that it is positive and given that it is negative
total_positive_words = sum(pos_word_freq.values())
total_negative_words = sum(neg_word_freq.values())
positive_probabilities = {word: count/total_positive_words for word, count in pos_word_freq.items()}
negative_probabilities = {word: count/total_negative_words for word, count in neg_word_freq.items()}

# Accumulate the probabilities for each label
positive_probability_sum = 0
negative_probability_sum = 0

prior = calc_prior(Review, Label)

likelyhood = []

prior_positive = 1
prior_negative = 1

for i in range(len(X_test)):
  sentence_probability_positive = 1
  sentence_probability_negative = 1

  for word in X_test.iloc[i]:
    if word in positive_probabilities:
      sentence_probability_positive *= positive_probabilities[word]
    if word in negative_probabilities:
      sentence_probability_negative *= negative_probabilities[word]

  prior_positive *= sentence_probability_positive
  prior_negative *= sentence_probability_negative

  if prior_positive > prior_negative:
    likelyhood.append(prior_positive)
  else:
    likelyhood.append(prior_negative)

def accuracy(y_test, y_pred):

  y_test_num = y_test.replace({'positive': 1, 'negative': 0})
  y_pred_num = pd.Series(likelyhood).apply(lambda x: 1 if x > 0.5 else 0)
  accurate = np.sum(y_test_num == y_pred_num) / len(y_test_num)

result = accuracy(y_test, likelyhood)

Top 10 most frequent words in positive reviews:
service: 32
is: 26
I: 24
product: 24
this: 23
This: 21
a: 18
The: 18
I'm: 14
with: 13
Top 10 most frequent words in negative reviews:
The: 57
service: 41
is: 41
product: 26
not: 26
as: 24
a: 17
was: 16
I: 11
to: 11


In [39]:
NB = NaiveBayesClassifier()

NB.fit(X_train, y_train)

# 3. Using the trained classifier to predict the sentiment of each review in the test set.
predictions = NB.predict(X_test)

# 4. Computes the accuracy of the classifier
print(NB.accuracy(y_test, predictions))

predict = {'I had a terrible experience with this company', 'This is a great company with excellent customer service', 'I was really disappointed with this product', 'The service is too expensive for what it offers'}

transformed_data = check.transform(predict)
predict_df = pd.DataFrame(transformed_data.toarray(), columns = check.get_feature_names_out())
predictions = NB.predict(predict_df)
print(predictions)

0.9069767441860465
['negative', 'positive', 'positive', 'negative']
