<a href="https://colab.research.google.com/github/HammadSiddiqui30/Autonomous_Vehicle_Perception/blob/master/Sentiment_Analysis_on_Movie_Reviews(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Sentiment Analysis on Movie Reviews***

In [1]:
pip install nltk scikit-learn




In [2]:
import nltk
from nltk.corpus import movie_reviews
import random

# Download NLTK data
nltk.download('movie_reviews')
nltk.download('punkt')

# Load movie reviews data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents to ensure randomness
random.shuffle(documents)

print(f'Total documents: {len(documents)}')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Total documents: 2000


In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(' '.join(text))
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Preprocess the documents
documents = [(preprocess_text(document), category) for document, category in documents]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from nltk import FreqDist

# Get the list of all words in the corpus
all_words = [word for document, category in documents for word in document]
all_words_freq = FreqDist(all_words)

# Select the top 2000 most frequent words as features
word_features = list(all_words_freq.keys())[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features

# Create feature sets
feature_sets = [(document_features(document), category) for document, category in documents]


In [5]:
from sklearn.model_selection import train_test_split
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

# Split the data into training and testing sets
train_set, test_set = train_test_split(feature_sets, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy: {accuracy(classifier, test_set) * 100:.2f}%')

# Show the most informative features
classifier.show_most_informative_features(10)


Accuracy: 78.75%
Most Informative Features
    contains(astounding) = True              pos : neg    =     11.1 : 1.0
        contains(debate) = True              pos : neg    =     11.1 : 1.0
      contains(poignant) = True              pos : neg    =     10.3 : 1.0
     contains(fashioned) = True              pos : neg    =      9.5 : 1.0
   contains(outstanding) = True              pos : neg    =      9.4 : 1.0
        contains(finest) = True              pos : neg    =      8.2 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.6 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.9 : 1.0
          contains(earl) = True              pos : neg    =      6.4 : 1.0
       contains(layered) = True              pos : neg    =      6.4 : 1.0


In [6]:
def classify_review(review_text):
    review_tokens = preprocess_text(review_text)
    review_features = document_features(review_tokens)
    return classifier.classify(review_features)

# Example review
review = "This movie was absolutely fantastic! The story was engaging and the characters were well-developed."
print(f'Review: {review}')
print(f'Sentiment: {classify_review(review)}')


Review: This movie was absolutely fantastic! The story was engaging and the characters were well-developed.
Sentiment: neg


***NLTK WITH SVM***

In [7]:
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download NLTK data
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

# Load movie reviews data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents to ensure randomness
random.shuffle(documents)

# Preview of the dataset
print(f'Total documents: {len(documents)}')
print(f'Sample document (first 100 words): {documents[0][0][:100]}')
print(f'Category: {documents[0][1]}')

# Define stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text (split it into individual words)
    tokens = word_tokenize(' '.join(text))
    # Convert words to lowercase and remove non-alphabetic tokens
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Preprocess the documents
documents = [(preprocess_text(document), category) for document, category in documents]

# Convert the documents into a format suitable for machine learning
texts = [' '.join(document) for document, category in documents]
labels = [category for document, category in documents]

# Split the data into training and testing sets (80% train, 20% test)
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create a pipeline with TfidfVectorizer and SVC
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel='linear'))
])

# Train the model
pipeline.fit(texts_train, labels_train)

# Evaluate the model
predictions = pipeline.predict(texts_test)
print(f'Accuracy: {accuracy_score(labels_test, predictions) * 100:.2f}%')

# Function to classify a new review
def classify_review(review_text):
    return pipeline.predict([review_text])[0]

# Example reviews to classify
reviews = [
    "This movie was absolutely fantastic! The story was engaging and the characters were well-developed.",
    "The movie was terrible. The plot was boring and the acting was awful.",
    "An average movie with some good moments but overall not very memorable."
]

for review in reviews:
    sentiment = classify_review(review)
    print(f'Review: {review}')
    print(f'Sentiment: {sentiment}')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total documents: 2000
Sample document (first 100 words): ['it', "'", 's', 'tough', 'to', 'really', 'say', 'something', 'nice', 'about', 'a', 'type', 'of', 'person', 'who', "'", 's', 'so', 'ethnocentric', 'that', 'any', 'humanity', 'they', 'once', 'had', 'is', 'now', 'gone', ',', 'but', 'by', 'god', ',', '"', 'american', 'history', 'x', '"', 'does', 'it', ',', 'and', 'for', 'that', ',', 'i', 'commend', 'it', '.', 'it', 'not', 'only', 'takes', 'balls', 'but', 'intelligence', 'to', 'make', 'a', 'human', 'being', 'out', 'of', 'a', 'neo', '-', 'nazi', 'skinhead', ',', 'a', 'kind', 'of', 'person', 'who', 'dedicates', 'their', 'lives', 'to', 'hating', 'anyone', 'who', "'", 's', 'not', 'what', 'they', 'are', ',', 'and', 'this', 'film', 'wisely', 'and', 'miracurously', 'pulls', 'it', 'off', '.', 'the', 'subject']
Category: pos
Accuracy: 84.25%
Review: This movie was absolutely fantastic! The story was engaging and the characters were well-developed.
Sentiment: pos
Review: The movie was terrible