In [5]:
# Importing required libraries
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
# Download required NLTK resources
nltk.download('vader_lexicon', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [75]:
df = pd.read_csv("C:/Users/Lenovo/Downloads/movie_reviews (1).csv")

Question 1: Data Analysis and Preprocessing (25 pts)

In [78]:
print("Number of samples: ", len(df))

Number of samples:  40000


In [80]:
# Display class distribution
print("\nClass Dsitribution")
print(df['label'].value_counts())


Class Dsitribution
label
0    20019
1    19981
Name: count, dtype: int64


In [82]:
print(df.info)

<bound method DataFrame.info of                                                     text  label
0      I grew up (b. 1965) watching and loving the Th...      0
1      When I put this movie in my DVD player, and sa...      0
2      Why do people who do not know what a particula...      0
3      Even though I have great interest in Biblical ...      0
4      Im a die hard Dads Army fan and nothing will e...      1
...                                                  ...    ...
39995  "Western Union" is something of a forgotten cl...      1
39996  This movie is an incredible piece of work. It ...      1
39997  My wife and I watched this movie because we pl...      0
39998  When I first watched Flatliners, I was amazed....      1
39999  Why would this film be so good, but only gross...      1

[40000 rows x 2 columns]>


In [86]:
# Show 2 examples from each class
print("\nPositive Reviews Examples:")
# print(df[df['label'] == '1']['text'].head(4), "\n")
# First, map them to human-readable form (optional but useful)
print(df['label'].value_counts())
df['label'] = df['label'].map({1: 'positive', 0: 'negative'})

# Then run:
print(df['label'].value_counts())
print("\nPositive Reviews Examples:")
print(df[df['label'] == 'positive']['text'].head(2), "\n")


Positive Reviews Examples:
label
0    20019
1    19981
Name: count, dtype: int64
label
negative    20019
positive    19981
Name: count, dtype: int64

Positive Reviews Examples:
4    Im a die hard Dads Army fan and nothing will e...
6    Finally watched this shocking movie last night...
Name: text, dtype: object 



In [39]:
import pandas as pd
import re
from openpyxl.utils.exceptions import IllegalCharacterError

# Define illegal character pattern
def remove_illegal_chars(text):
    if isinstance(text, str):
        # Remove characters not allowed in Excel (except for newline and tab)
        return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
    return text

# Load your CSV
df = pd.read_csvdf = pd.read_csv("C:/Users/Lenovo/Downloads/movie_reviews (1).csv", on_bad_lines='skip')

# Apply cleanup to all cells
df_cleaned = df.applymap(remove_illegal_chars)

# Save to Excel
df_cleaned.to_excel("C:/Users/Lenovo/Downloads/movie_reviews_cleaned.xlsx", index=False)


  df_cleaned = df.applymap(remove_illegal_chars)


In [92]:
# Calculate average text length for each class
df['text_length'] = df['text'].apply(lambda x: len(x.split()))
print("\nAverage Text Length by Class:")
print(df.groupby('label')['text_length'].mean())


Average Text Length by Class:
label
negative    229.204606
positive    233.477954
Name: text_length, dtype: float64


In [94]:
# 1.2 Basic Preprocessing (15 pts)

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["processed_text"] = df['text'].apply(preprocess)
df[['text', 'processed_text']].head(5)
    

Unnamed: 0,text,processed_text
0,I grew up (b. 1965) watching and loving the Th...,grew b watching loving thunderbirds mates scho...
1,"When I put this movie in my DVD player, and sa...",put movie dvd player sat coke chips expectatio...
2,Why do people who do not know what a particula...,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,even though great interest biblical movies bor...
4,Im a die hard Dads Army fan and nothing will e...,im die hard dads army fan nothing ever change ...


2.1 Apply VADER Analysis (20 pts)


In [99]:
# Initialize VADER
sia = SentimentIntensityAnalyzer()

# Get compound scores
df['compound'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Convert to binary label: positive if compound > 0
df['vader_prediction'] = df['compound'].apply(lambda x: 'positive' if x > 0 else 'negative')

# Show a few results
df[['text', 'compound', 'vader_prediction']].head()


Unnamed: 0,text,compound,vader_prediction
0,I grew up (b. 1965) watching and loving the Th...,0.6502,positive
1,"When I put this movie in my DVD player, and sa...",0.9314,positive
2,Why do people who do not know what a particula...,-0.9568,negative
3,Even though I have great interest in Biblical ...,-0.7515,negative
4,Im a die hard Dads Army fan and nothing will e...,0.7469,positive


In [103]:
print("VADER Evaluation:\n")
print(classification_report(df['label'], df['vader_prediction']))

print("Confusion Matrix: ")
print(confusion_matrix(df['label'], df['vader_prediction']))

VADER Evaluation:

              precision    recall  f1-score   support

    negative       0.79      0.54      0.64     20019
    positive       0.65      0.86      0.74     19981

    accuracy                           0.70     40000
   macro avg       0.72      0.70      0.69     40000
weighted avg       0.72      0.70      0.69     40000

Confusion Matrix: 
[[10765  9254]
 [ 2889 17092]]


In [105]:
#  Machine Learning Implementation (35 pts)

# 3.1 Feature Extraction 

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['processed_text'])

# Labels
y = df['label']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [111]:
# 3.2 Train and Evaluate Classifier (20 pts)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluation
print("Naive Bayes Evaluation:\n")
print(classification_report(y_test, y_pred))

Naive Bayes Evaluation:

              precision    recall  f1-score   support

    negative       0.84      0.83      0.83      3966
    positive       0.83      0.85      0.84      4034

    accuracy                           0.84      8000
   macro avg       0.84      0.84      0.84      8000
weighted avg       0.84      0.84      0.84      8000



In [113]:
# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[3278  688]
 [ 622 3412]]


Question 4: Comparison and Analysis (10 pts)


In [120]:
# Comparison table
from sklearn.metrics import precision_score, recall_score

# VADER metrics
vader_acc = accuracy_score(df['label'], df['vader_prediction'])
vader_prec = precision_score(df['label'], df['vader_prediction'], pos_label = 'positive')
vader_rec = recall_score(df['label'], df['vader_prediction'], pos_label = 'positive')

# NB metrics
nb_acc = accuracy_score(y_test, y_pred)
nb_prec = precision_score(y_test, y_pred, pos_label = 'positive')
nb_rec = recall_score(y_test, y_pred, pos_label = 'positive')

# Create comparison tabel
comparison = pd.DataFrame({
    'Model' : ["VADER", "Naive Bayes"],
    "Accuracy" : [vader_acc, nb_acc],
    'Precision': [vader_prec, nb_prec],
    'Recall': [vader_rec, nb_rec]
})

print("\nComparison table: ")
print(comparison)


Comparison table: 
         Model  Accuracy  Precision    Recall
0        VADER  0.696425   0.648751  0.855413
1  Naive Bayes  0.836250   0.832195  0.845811


In [122]:
# 4.2 Critical Analysis (5 pts)

VADER:
+ Advantage: Pre-trained and works well on social media or informal text.
- Disadvantage: May miss domain-specific context and nuances in longer reviews.

Naive Bayes:
+ Advantage: Learns from data, can capture domain-specific patterns.
- Disadvantage: Requires preprocessing and training, and performance depends on data quality.

For a production system, I would choose the Naive Bayes approach due to its better adaptability to the specific dataset and the potential for improvement with more data and tuning. VADER is great for quick prototyping, but ML models offer more control and customizability.


In [127]:
print(df['label'].unique())
print(df['label'].dtype)

['negative' 'positive']
object
