In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk

In [3]:
# Download every NLTK resource this notebook relies on, so it runs on a fresh
# environment. nltk.download() reports already-installed packages as up to
# date instead of fetching them again, so re-running this cell is cheap.
nltk.download('stopwords')      # English stopword list used in preprocessing
nltk.download('punkt')          # tokenizer models used by word_tokenize
# Newer NLTK releases load 'punkt_tab' instead of 'punkt' for word_tokenize;
# fetching both keeps the notebook working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')        # lexical database behind WordNetLemmatizer
# The review corpus read by load_data(); without it the first corpus access
# raises LookupError on a machine that has never downloaded it.
nltk.download('movie_reviews')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mehna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Step 1: Data sourcing
# Using the movie_reviews corpus from NLTK (Pang & Lee polarity dataset: 2,000 labelled reviews drawn from the IMDb archive)
from nltk.corpus import movie_reviews

In [7]:
# Prepare the dataset
def load_data():
    """
    Build a DataFrame of movie reviews and their sentiment labels.

    Walks every category of the NLTK ``movie_reviews`` corpus and, for each
    file in that category, joins the file's tokens with single spaces into
    one review string.

    Returns:
        pandas.DataFrame with two columns: 'review' (the space-joined
        token text) and 'sentiment' (the corpus category of the file).
    """
    reviews = []
    sentiments = []
    for label in movie_reviews.categories():
        for file_id in movie_reviews.fileids(label):
            # The corpus exposes each file as a sequence of tokens; rebuild
            # a single string so later steps can treat it as plain text.
            reviews.append(' '.join(movie_reviews.words(file_id)))
            sentiments.append(label)

    return pd.DataFrame({'review': reviews, 'sentiment': sentiments})

In [9]:
# Load the data: one row per review, with 'review' and 'sentiment' columns
data = load_data()

In [11]:
# Preview the first rows of the dataset under a short heading
print("Sample data:", data.head(), sep="\n")

Sample data:
                                              review sentiment
0  plot : two teen couples go to a church party ,...       neg
1  the happy bastard ' s quick movie review damn ...       neg
2  it is movies like these that make a jaded movi...       neg
3  " quest for camelot " is warner bros . ' first...       neg
4  synopsis : a mentally unstable man undergoing ...       neg


In [13]:
# Step 2: Data preprocessing
# Shared helpers for preprocess_text: the English stopword set (a set gives
# fast membership checks) and a WordNet-backed lemmatizer.
stop_words = {word for word in stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

In [15]:
def preprocess_text(text):
    """
    Normalize raw text into a space-separated string of lemmas.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space
         (this drops digits, punctuation and other symbols).
      2. Lowercase the result.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Discard tokens present in the module-level ``stop_words`` set.
      5. Lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    kept_lemmas = []
    for token in word_tokenize(letters_only):
        # Stopwords are filtered on the surface form, before lemmatization.
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

In [17]:
# Add a 'cleaned_review' column holding the preprocessed text of each review
data['cleaned_review'] = data['review'].map(preprocess_text)

In [19]:
# Step 3: Hold out 20% of the reviews for testing; the fixed random_state
# makes the split repeatable across runs.
X, y = data['cleaned_review'], data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Step 4: Feature extraction (vectorization)
# Using TF-IDF vectorization to convert text to numerical representation.
# The vectorizer (capped at 5000 features) is fitted on the training text only;
# the test text is then transformed with that already-fitted vectorizer, so no
# information from the test split is used when fitting.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [23]:
# Step 5: Model training
# Fit a Multinomial Naive Bayes classifier (default settings) on the TF-IDF
# training features and their sentiment labels.
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [25]:
# Step 6: Model evaluation
# Predict on the held-out split, then report per-class metrics followed by
# overall accuracy as a percentage.
y_pred = model.predict(X_test_vec)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_pct:.2f}%")


Classification Report:
              precision    recall  f1-score   support

         neg       0.78      0.85      0.81       199
         pos       0.84      0.76      0.80       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.80       400
weighted avg       0.81      0.81      0.80       400

Accuracy: 80.50%


In [27]:
# Step 7: Custom review testing
# Send one hand-written review through the same clean -> vectorize -> predict
# path used for the corpus reviews.
custom_review = "The product was great and exceeded my expectations!"
custom_review_cleaned = preprocess_text(custom_review)
custom_review_vec = vectorizer.transform([custom_review_cleaned])
predicted_sentiment = model.predict(custom_review_vec)
# predict() returns one label per input row; a single review was passed in.
predicted_label = predicted_sentiment[0]
print(f"\nCustom Review Sentiment: {predicted_label}")


Custom Review Sentiment: pos
