In [27]:
import os 
import tarfile
import pyprind
import pandas as pd
from packaging import version
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
# TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single step
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Download the NLTK data needed for tokenizing, POS tagging and lemmatizing.
# NLTK >= 3.9 looks up the tokenizer and tagger under new resource names
# ('punkt_tab', 'averaged_perceptron_tagger_eng'); the legacy names are kept
# so that older NLTK releases still find their data as well.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Read the review dataset, then show the non-null count of every column
df = pd.read_csv(filepath_or_buffer='movie_data.csv')
df.count()

In [None]:
# Review texts and sentiment labels pulled out of the frame as arrays
x, y = (df[column].values for column in ("review", "sentiment"))

# Bag-of-words vectorizer created with its default settings
count = CountVectorizer()


In [None]:
# Fit the vectorizer on the raw reviews and build the document-term matrix
bag_of_words = count.fit_transform(x)

# Get the vocabulary (unique words), ordered like the matrix columns
feature_names = count.get_feature_names_out()

# Sum the counts for each word across all documents.
# np.asarray(...).ravel() flattens the column sums to 1-D whether the sparse
# sum comes back as an np.matrix or as a plain ndarray (the previous `.A1`
# attribute only exists on np.matrix).
word_counts = np.asarray(bag_of_words.sum(axis=0)).ravel()

# Create a dictionary mapping words to their counts
word_counts_dict = dict(zip(feature_names, word_counts))

# Show only the most frequent words; printing the whole vocabulary dictionary
# floods the notebook output.
TOP_N_WORDS = 20
pd.Series(word_counts, index=feature_names, name='count').nlargest(TOP_N_WORDS)

In [None]:
# Define preprocessing functions
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns
    return wordnet.NOUN

def preprocess_text(text):
    """Tokenize, POS-tag, lemmatize and stem a piece of text.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Tokenize the text
    tokens = word_tokenize(text)

    # POS-tag the tokens in their original case
    pos_tags = nltk.pos_tag(tokens)

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Lemmatize, then stem. The token is lower-cased for the lemmatizer
    # because WordNet lemma lookup is case-sensitive: capitalised tokens
    # (e.g. at the start of a sentence) would otherwise skip lemmatization
    # and yield a different feature than their lowercase form.
    processed_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag)))
        for word, tag in pos_tags
    ]

    return ' '.join(processed_tokens)



In [None]:
# Run the preprocessing function over every review and store the result
# in a new column next to the raw text
raw_reviews = df['review']
df['processed_review'] = raw_reviews.apply(preprocess_text)

In [None]:
# Hold out 20% of the processed reviews for testing; the fixed random_state
# makes the split repeatable
X, y = df['processed_review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# TF-IDF features with the vocabulary capped via max_features
vectorizer = TfidfVectorizer(max_features=10000)

# Fit the vectorizer on the training split only, then reuse it for the test split
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression classifier with default hyperparameters
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)



In [None]:
# Predicted labels for the held-out reviews
y_pred = model.predict(X_test_tfidf)

# Accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test accuracy: {accuracy}')



In [None]:
# Confusion matrix of true vs. predicted test labels, drawn with a blue colormap
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_names = ['Negative', 'Positive']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.show()



In [None]:
# Function to predict sentiment of a given text
def predict_sentiment(text, vectorizer, model):
    """Classify one piece of text as 'Positive' or 'Negative'.

    The text is run through preprocess_text, transformed with the given
    vectorizer and passed to model.predict. A predicted label equal to 1 is
    reported as 'Positive'; any other label as 'Negative'.
    """
    features = vectorizer.transform([preprocess_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'Positive'
    return 'Negative'



In [None]:
# Example sentences: hand-written reviews used below to try out predict_sentiment
examples = [
    "This movie was fantastic! I really enjoyed it.",
    "I hated this movie. It was terrible.",
    "It was an average movie, not too good, not too bad.",
    "The plot was boring and the acting was bad.",
    "Absolutely loved the cinematography and story."
]



In [None]:
# Print each example review together with its predicted sentiment
for sentence in examples:
    sentiment = predict_sentiment(text=sentence, vectorizer=vectorizer, model=model)
    print(f"Review: {sentence}\nPredicted Sentiment: {sentiment}\n")