<a href="https://colab.research.google.com/github/Jhansipothabattula/Data_Science/blob/main/Day171.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis on Text Data using NLTK

In [2]:
# Import necessary libraries
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
from nltk.corpus import stopwords
import random

# Fetch every NLTK resource this notebook uses, in a fixed order
for resource_name in (
    'movie_reviews',
    'punkt',
    'stopwords',
    'punkt_tab',  # added to resolve a LookupError
):
    nltk.download(resource_name)

# Convert a sequence of tokens into a bag-of-words feature dictionary
def extract_features(words):
    """Return a dict mapping each distinct item of ``words`` to ``True``.

    Duplicate items collapse into a single key; key order follows first
    appearance in ``words``.
    """
    return dict.fromkeys(words, True)

# Collect one (token list, category) pair per review file in the corpus
documents = []
for review_category in movie_reviews.categories():
    documents.extend(
        (list(movie_reviews.words(review_id)), review_category)
        for review_id in movie_reviews.fileids(review_category)
    )

# Randomise the order so the split below is not grouped by category
random.shuffle(documents)

# Turn every document into a (feature dict, label) pair
featuresets = [(extract_features(tokens), label) for tokens, label in documents]

# First 1600 examples are used for training, the remainder for testing
train_set = featuresets[:1600]
test_set = featuresets[1600:]

# Fit a Naive Bayes model on the training portion
classifier = NaiveBayesClassifier.train(train_set)

# Measure accuracy on the held-out portion and report it as a percentage
accuracy = nltk_accuracy(classifier, test_set)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print the 10 most informative features of the trained model
classifier.show_most_informative_features(10)

# Classify the sentiment of a new piece of text
def analyze_sentiment(text):
    """Return the trained classifier's predicted label for ``text``.

    The text is tokenized with ``nltk.word_tokenize``, lowercased, stripped
    of English stopwords, converted to a feature dict with
    ``extract_features`` and passed to the module-level ``classifier``.

    Args:
        text: The raw sentence or passage to classify.

    Returns:
        Whatever ``classifier.classify`` returns for the extracted features
        (the sample run in this notebook prints ``pos`` / ``neg``).
    """
    # Build the stopword set once per call; looking the list up inside the
    # comprehension would reload it for every single token.
    stop_words = set(stopwords.words('english'))

    # Lowercase every token so it can match the training vocabulary
    # (the movie_reviews tokens used for training are lowercase); a
    # capitalized token would otherwise be an unseen feature.
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    words = [token for token in tokens if token not in stop_words]

    # Predict sentiment
    features = extract_features(words)
    return classifier.classify(features)

# Sample inputs used to try the trained classifier by hand
test_sentences = [
    "This movie is absolutely fantastic! The acting, the story, everything was amazing!",
    "I hated this movie. It was a waste of time and money.",
    "The plot was a bit dull, but the performances were great.",
    "I have mixed feelings about this film. It was okay, not great but not terrible either.",
]

# Echo each sentence, then its predicted label followed by a blank line
for sentence in test_sentences:
    print(f"Sentence: {sentence}")
    predicted = analyze_sentiment(sentence)
    print(f"Predicted sentiment: {predicted}\n")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Accuracy: 73.75%
Most Informative Features
                   damon = True              pos : neg    =     15.5 : 1.0
               ludicrous = True              neg : pos    =     13.6 : 1.0
            breathtaking = True              pos : neg    =     12.4 : 1.0
                 idiotic = True              neg : pos    =     11.9 : 1.0
                downhill = True              neg : pos    =     11.8 : 1.0
             outstanding = True              pos : neg    =     11.7 : 1.0
                   blend = True              pos : neg    =     11.5 : 1.0
              astounding = True              pos : neg    =     10.9 : 1.0
                   sucks = True              neg : pos    =     10.3 : 1.0
                  finest = True              pos : neg    =     10.3 : 1.0
Sentence: This movie is absolutely fantastic! The acting, the story, everything was amazing!
Predicted sentiment: pos

Sentence: I hated this movie. It was a waste of time and money.
Predicted sentiment: neg