# MOVIE REVIEW ANALYSIS



In [1]:
import math
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, f1_score

In [2]:
#IMPLEMENT THE NAIVE BAYES CLASSIFIER 
class NaiveBayesClassifier:
    def __init__(self):
        self.positive_reviews = []
        self.negative_reviews = []
        self.positive_words = {}
        self.negative_words = {}
        self.total_positive_reviews = 0
        self.total_negative_reviews = 0

    def train(self, dataset):
        for _, row in dataset.iterrows():
            review = row['review']
            sentiment = row['sentiment']
            
            if sentiment == 'positive':
                self.positive_reviews.append(review)
                self.total_positive_reviews += 1
            else:
                self.negative_reviews.append(review)
                self.total_negative_reviews += 1

            words = review.split()
            for word in words:
                if sentiment == 'positive':
                    self.positive_words[word] = self.positive_words.get(word, 0) + 1
                else:
                    self.negative_words[word] = self.negative_words.get(word, 0) + 1
                    
    def calculate_probability(self, words, sentiment):
        if sentiment == 'positive':
            total_reviews = self.total_positive_reviews
            review_count = len(self.positive_reviews)
            word_count = sum(self.positive_words.values())
            words_dict = self.positive_words
        else:
            total_reviews = self.total_negative_reviews
            review_count = len(self.negative_reviews)
            word_count = sum(self.negative_words.values())
            words_dict = self.negative_words

        probability = math.log(review_count / total_reviews)
        for word in words:
            word_occurrences = words_dict.get(word, 0) + 1
            word_probability = word_occurrences / (word_count + len(words_dict))
            probability += math.log(word_probability)

        return probability

    def predict(self, review):
        words = review.split()

        positive_probability = self.calculate_probability(words, 'positive')
        negative_probability = self.calculate_probability(words, 'negative')

        if positive_probability > negative_probability:
            return 'positive'
        else:
            return 'negative'    


In [3]:
# Load the dataset
dataset = pd.read_csv("C:/Users/manza/Downloads/dataset.csv")


In [4]:
# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)  # Join the list of words back into a string

dataset['review'] = dataset['review'].apply(preprocess_text)

In [5]:
# Create an instance of the Word2Vec model
word2vec_model = Word2Vec(sentences=dataset['review'].apply(word_tokenize), vector_size=100, window=5, min_count=1, sg=0)

In [6]:
# Split the dataset into train and test sets
train_size = int(0.8 * len(dataset))
train_dataset = dataset[:train_size]
test_dataset = dataset[train_size:]

In [7]:
# Create an instance of the classifier and train it
classifier = NaiveBayesClassifier()
classifier.train(dataset[['review', 'sentiment']])  # Pass only the relevant columns

In [8]:
# Evaluate the model on the test set
true_labels = test_dataset['sentiment']
predicted_labels = [classifier.predict(preprocess_text(review)) for review in test_dataset['review']]


In [9]:
#calculating the accuracy of the model
accuracy = accuracy_score(true_labels, predicted_labels)


In [10]:
# Input a review and predict sentiment
review = input("Give us your review:")
preprocessed_review = preprocess_text(review)
embedding_vector = sum([word2vec_model.wv[word] for word in word_tokenize(preprocessed_review) if word in word2vec_model.wv])
prediction = classifier.predict(preprocessed_review)
print(f"Original Review: {review}")
print(f"Preprocessed Review: {preprocessed_review}")
print(f"Sentiment: {prediction}")

print(f"Accuracy: {accuracy:.2f}")


Give us your review:The movie was bad i did not like the movie
Original Review: The movie was bad i did not like the movie
Preprocessed Review: movie bad like movie
Sentiment: negative
Accuracy: 0.97
