In [39]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

# Importing necessary libraries 
import os
import pandas as pd
import re
import nltk
import math 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
nltk.download('punkt') 
# Specifying the path as the second argument 
nltk.download('stopwords', download_dir=r'C:\Users\aze\Desktop\application_of_Naive_Bayes')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aze\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aze\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aze\Desktop\application_of_Naive_Bayes...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

# Function to load movie reviews from a directory
def load_reviews(directory):
    reviews = []  # List to store review texts
    labels = []  # List to store review labels (pos/neg)

    # Looping over the directories 'pos' and 'neg'
    for label in ["pos", "neg"]:
        directory_name = os.path.join(directory, label)  # Path to the sub-directory
        # Iterating over the files in the directory
        for filename in os.listdir(directory_name):
            # Checking if the file is a text file
            if filename.endswith(".txt"):
                # Opening the file for reading with proper encoding
                with open(os.path.join(directory_name, filename), 'r', encoding='utf-8') as file:
                    text = file.read()  # Reading the content of the file
                    reviews.append(text)  # Appending the text to the reviews list
                    labels.append(label)  # Appending the label to the labels list

    # Returning the list of reviews and their corresponding labels
    return reviews, labels

In [41]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

reviews, labels = load_reviews(r'C:\Users\aze\Desktop\application_of_Naive_Bayes\txt_sentoken')

In [42]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

# Converting the lists of reviews and labels into a DataFrame for easier manipulation
data = pd.DataFrame({
    'review': reviews,
    'label': labels
})


# Assuming 'data' is your DataFrame - dataset inspection 
print(data.head())  # Displaying the first few reviews
print(data.isnull().sum())  # Checking for missing values
print(data['review'].apply(len).describe())  # Analyzing review lengths

                                              review label
0  films adapted from comic books have had plenty...   pos
1  every now and then a movie comes along from a ...   pos
2  you've got mail works alot better than it dese...   pos
3   " jaws " is a rare film that grabs your atten...   pos
4  moviemaking is a lot like being the general ma...   pos
review    0
label     0
dtype: int64
count     2000.000000
mean      3893.002000
std       1712.425852
min         91.000000
25%       2737.750000
50%       3622.500000
75%       4720.250000
max      14957.000000
Name: review, dtype: float64


In [43]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

# Setting of English stopwords
english_stopwords = set(stopwords.words('english'))

In [44]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

# Function to preprocess text data with a tokenizer
def preprocess_text(text):
    # Removing non-alphanumeric characters
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Tokenizing text
    tokens = word_tokenize(text)
    # Removing stopwords
    tokens = [token for token in tokens if token not in english_stopwords]
    # Rejoining all tokens back into a single string
    text = ' '.join(tokens)
    return text

In [45]:
# APPLIES TO BOTH OF THE IMPLEMENTATIONS

# Preprocessing each review in the dataset
preprocessed_reviews = [preprocess_text(review) for review in reviews]

In [46]:
# APPLIES TO MANUAL IMPLEMENTATION ONLY 

# Separating the pre-processed reviews in positive and negative lists based on the corresponding labels 
positive_reviews = []
negative_reviews = []

for review, label in zip(preprocessed_reviews, labels):
    if label == 'pos':  
        positive_reviews.append(review)
    elif label == 'neg': 
        negative_reviews.append(review)

In [47]:
# APPLIES TO LIBRARY IMPLEMENTATION ONLY

# 'labels' is a list of corresponding labels (e.g., 'pos' or 'neg').

# splitting the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(preprocessed_reviews, labels, test_size=0.2)


# Initialize the CountVectorizer with the custom tokenizer
vectorizer = CountVectorizer()

# # converting the text documents to a matrix of token counts
# vectorizer = CountVectorizer()
X_train_counters = vectorizer.fit_transform(X_train)
X_test_counters = vectorizer.transform(X_test)

# training a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_counters, Y_train)

# making predictions on the test set
Y_predicted = clf.predict(X_test_counters)

# evaluating the classifier
accuracy = accuracy_score(Y_test, Y_predicted)
report = classification_report(Y_test, Y_predicted)

print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.8025
              precision    recall  f1-score   support

         neg       0.82      0.78      0.80       203
         pos       0.78      0.83      0.80       197

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



In [48]:
# APPLIES TO MANUAL IMPLEMENTATION ONLY

# Creating dictionaries to count the frequency of each token among the reviews.
# Each word is counted only once per review to avoid bias towards longer reviews.
positive_dictionary = {}
negative_dictionary = {}

true_labels = []
predicted_labels = []
 
# Specifying the number of training data to take 
# Here we calculate 80% of the data for training
positive_count_train = round(0.8 * len(positive_reviews))
negative_count_train = round(0.8 * len(negative_reviews))

# Counter for positive and negative tokens in the training data
positive_token_count = 0
negative_token_count = 0

# Total count of reviews for positive and negative training data
total_count = positive_count_train + negative_count_train

# Splitting the the positive reviews into training and testing sets (with a random seed)
positive_train, positive_test = train_test_split(
    positive_reviews, test_size=0.2
)


for review in positive_train:  
  tokens = review.split()
  # Counting the total number of positive tokens
  positive_token_count += len(tokens)
  # 'used' is a list to keep track of tokens already seen in the current review
  used = []
  for token in tokens:
    if token == "":
      continue
    if token not in positive_dictionary:
      positive_dictionary[token] = 0
    # Incrementing the count for each token only once per review
    if token not in used:
      positive_dictionary[token] += 1
      used.append(token)

# Splitting the the negative reviews into training and testing sets (with a random seed)
negative_train, negative_test = train_test_split(
    negative_reviews, test_size=0.2
)

for review in negative_train:
  tokens = review.split()
  # Counting the total number of negative tokens
  negative_token_count += len(tokens)
  # 'used' is a list to keep track of tokens already seen in the current review
  used = []
  for token in tokens:
    if token == "":
      continue
    if token not in negative_dictionary:
      negative_dictionary[token] = 0
    # Incrementing the count for each token only once per review
    if token not in used:
      negative_dictionary[token] += 1
      used.append(token)


# Testing the classifier with positive test data
correct_positive, total_positive = 0, len(positive_test) 
TP_pos, FN_pos = 0, 0
for test_review in positive_test:
  # Calculating the prior probability of a review being positive
  probability_positive = math.log(positive_count_train / total_count)
  # Calculating the prior probability of a review being negative
  probability_negative = math.log(negative_count_train / total_count)
  tokens = test_review.split()
  for token in tokens:
    if token == "":
      continue
    # Calculating the likelihood of each token for positive
    if token in positive_dictionary:
      probability_positive += math.log((positive_dictionary[token] + 1) / (positive_token_count + len(positive_dictionary)))
    else:
      probability_positive += math.log(1 / (positive_token_count + len(positive_dictionary)))

    # Calculating the likelihood of each token for negative
    if token in negative_dictionary:
      probability_negative += math.log((negative_dictionary[token] + 1) / (negative_token_count + len(negative_dictionary)))
    else:
      probability_negative += math.log(1 / (negative_token_count + len(negative_dictionary)))
     # Check if prediction is correct
    # if (probability_positive < probability_negative):
    #   TP_pos += 1  # True Positive: correctly identified negative
    # else:
    #  FN_pos += 1  # False Negative: incorrectly identified negative as positive
    true_labels.append('pos')  # The true label for these reviews is 'pos'
    predicted_label = 'pos' if probability_positive > probability_negative else 'neg'
    predicted_labels.append(predicted_label)
  # If the calculated probability for positive is greater, the review is classified as positive  
  if (probability_positive > probability_negative):
    correct_positive += 1


# Testing the classifier with negative test data
correct_negative, total_negative = 0, len(negative_test)
for test_review in negative_test:
  # Calculating the prior probability of a review being positive
  probability_positive = math.log(positive_count_train / total_count)
  # Calculaing the prior probability of a review being negative
  probability_negative = math.log(negative_count_train / total_count)

  tokens = test_review.split()
  for token in tokens:
    if token == "":
      continue
    # Calculating the likelihood of each token for positive
    if token in positive_dictionary:
      probability_positive += math.log((positive_dictionary[token] + 1) / (positive_token_count + len(positive_dictionary)))
    else:
      probability_positive += math.log(1 / (positive_token_count + len(positive_dictionary)))
    # Calculating the likelihood of each token for negative
    if token in negative_dictionary:
      probability_negative += math.log((negative_dictionary[token] + 1) / (negative_token_count + len(negative_dictionary)))
    else:
      probability_negative += math.log(1 / (negative_token_count + len(negative_dictionary)))

    true_labels.append('neg')  # The true label for these reviews is 'neg'
    predicted_label = 'neg' if probability_positive < probability_negative else 'pos'
    predicted_labels.append(predicted_label)

  # If the calculated probability for negative is greater, the review is classified as negative  
  if (probability_positive < probability_negative):
    correct_negative += 1

# Calculating total number of correct predictions and total number of predictions
total_correct = correct_negative + correct_positive
total_predictions = total_negative + total_positive

# Generating the classification report
# Now that we have the true and predicted labels, we can generate the classification report
report = classification_report(true_labels, predicted_labels, target_names=['neg', 'pos'])


# Calculating and print the accuracy of the classifier
print("Accuracy: " + str(total_correct / total_predictions))
print(report)


Accuracy: 0.77
              precision    recall  f1-score   support

         neg       0.66      0.82      0.73     67129
         pos       0.80      0.63      0.71     76907

    accuracy                           0.72    144036
   macro avg       0.73      0.73      0.72    144036
weighted avg       0.73      0.72      0.72    144036

