In [1]:
import pandas as pd
import string
import nltk
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords, opinion_lexicon
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report



# Download required NLTK datasets and lexicons
nltk.download([
    'punkt',            # Punkt tokenizer models for word and sentence tokenization
    'stopwords',        # List of common stop words for multiple languages
    'opinion_lexicon',  # Lexicon of positive and negative opinion words
    'vader_lexicon',    # Specialized lexicon for VADER sentiment analysis
    'punkt_tab',        # Additional Punkt tokenizer tables (improves tokenization)
    'movie_reviews'     # Sample movie reviews for sentiment analysis
])

#'punkt': Pre-trained tokenizer that splits text into sentences and words
#'stopwords': Common words to filter out (e.g., "the", "and", "a")
#'opinion_lexicon': Word lists for basic sentiment analysis (positive/negative)
#'vader_lexicon': Special sentiment lexicon including slang and emoticons
#'punkt_tab': Supplemental data for bett er tokenization of non-standard text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [11]:
class SentimentAnalyzer:
    """Machine learning based sentiment analyzer"""

    def __init__(self):
        self.classifier = None
        self.vader = SentimentIntensityAnalyzer()
        self.positive_words = set(opinion_lexicon.positive())
        self.negative_words = set(opinion_lexicon.negative())
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Clean and tokenize text"""
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        return [word for word in tokens if word not in self.stop_words]

    def SimpleSentimentAnalyzer(self, text):
        """Analyze sentiment of text"""
        tokens = self.preprocess(text)
        pos_count = len([word for word in tokens if word in self.positive_words])
        neg_count = len([word for word in tokens if word in self.negative_words])

        if pos_count > neg_count:
            return "positive"
        elif neg_count > pos_count:
            return "negative"
        else:
            return "neutral"

    def prepare_training_data(self):
        """Prepare sample training data (in a real app, you'd use a proper dataset)"""
        positive_samples = [
            ("I love this movie", "positive"),
            ("Great performance by the actors", "positive"),
            ("The plot was amazing", "positive"),
            ("Wonderful cinematography", "positive"),
            ("Highly recommended", "positive")
        ]

        negative_samples = [
            ("I hate this movie", "negative"),
            ("Terrible acting", "negative"),
            ("The plot was boring", "negative"),
            ("Waste of time", "negative"),
            ("Disappointing ending", "negative")
        ]

        neutral_samples = [
            ("The movie was okay", "neutral"),
            ("It was neither good nor bad", "neutral"),
            ("Average performance", "neutral"),
            ("The film was watchable", "neutral"),
            ("Nothing special", "neutral")
        ]

        return positive_samples + negative_samples + neutral_samples

    def extract_features(self, text):
        """Extract features from text for ML model"""
        words = word_tokenize(text.lower())
        words = [word for word in words if word not in stopwords.words('english')]
        return {word: True for word in words}

    def train(self):
        """Train the Naive Bayes classifier"""
        training_data = self.prepare_training_data()
        featuresets = [(self.extract_features(text), label) for (text, label) in training_data]
        self.classifier = NaiveBayesClassifier.train(featuresets)

    def analyze(self, text):
        """Analyze sentiment using ML and VADER"""
        # ML approach
        features = self.extract_features(text)
        ml_result = self.classifier.classify(features)

        # VADER approach (rule-based alternative)
        vader_scores = self.vader.polarity_scores(text)
        vader_result = "neutral"
        if vader_scores['compound'] >= 0.05:
            vader_result = "positive"
        elif vader_scores['compound'] <= -0.05:
            vader_result = "negative"

        return {
            "ML_classifier": ml_result,
            "VADER": vader_result,
            "VADER_scores": vader_scores
        }

In [12]:
def main():
    print("=== Sentiment Analysis ===")

    # Sample texts to analyze
    texts = [
        "I absolutely loved the movie! The acting was superb.",
        "The film was terrible. I hated every minute of it.",
        "It was okay. Nothing special, but not bad either.",
        "The cinematography was beautiful, but the plot was weak.",
        "This is the worst movie I've ever seen in my life!"
    ]

    st_analyzer = SentimentAnalyzer()
    st_analyzer.train()

    for text in texts:
        results = st_analyzer.analyze(text)
        sentiment = st_analyzer.SimpleSentimentAnalyzer(text)
        print(f"\nText: {text[:50]}...")
        print(f"ML Classifier: {results['ML_classifier']}")
        print(f"Lexicon-Based Analyzer: {sentiment}")
        print(f"VADER: {results['VADER']}")
        print(f"VADER Scores: {results['VADER_scores']}")

if __name__ == "__main__":
    main()

=== Sentiment Analysis ===

Text: I absolutely loved the movie! The acting was super...
ML Classifier: negative
Lexicon-Based Analyzer: positive
VADER: positive
VADER Scores: {'neg': 0.0, 'neu': 0.411, 'pos': 0.589, 'compound': 0.862}

Text: The film was terrible. I hated every minute of it....
ML Classifier: neutral
Lexicon-Based Analyzer: negative
VADER: negative
VADER Scores: {'neg': 0.51, 'neu': 0.49, 'pos': 0.0, 'compound': -0.8074}

Text: It was okay. Nothing special, but not bad either....
ML Classifier: neutral
Lexicon-Based Analyzer: negative
VADER: positive
VADER Scores: {'neg': 0.127, 'neu': 0.467, 'pos': 0.406, 'compound': 0.5568}

Text: The cinematography was beautiful, but the plot was...
ML Classifier: positive
Lexicon-Based Analyzer: negative
VADER: negative
VADER Scores: {'neg': 0.289, 'neu': 0.526, 'pos': 0.184, 'compound': -0.34}

Text: This is the worst movie I've ever seen in my life!...
ML Classifier: positive
Lexicon-Based Analyzer: negative
VADER: negative
VADER

In [18]:
class SentimentAnalyzer:
    def __init__(self):
        # Initialize all analyzers
        self.vader = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))

        # For ML model
        self.ml_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer( # TF-IDF = TF × IDF Term Frequency (TF) Inverse Document Frequency (IDF)
                lowercase=True,
                stop_words='english',
                ngram_range=(1, 2),
                max_features=5000)),
            ('clf', LinearSVC(
                random_state=42,
                class_weight='balanced'))
        ])

    def clean_text(self, text):
        """Basic text cleaning"""
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        return ' '.join([word for word in tokens if word not in self.stop_words])

    def analyze_vader(self, text):
        """Analyze sentiment using VADER"""
        scores = self.vader.polarity_scores(text)
        if scores['compound'] >= 0.05:
            return 'positive'
        elif scores['compound'] <= -0.05:
            return 'negative'
        else:
            return 'neutral'

    def analyze_textblob(self, text):
        """Analyze sentiment using TextBlob"""
        analysis = TextBlob(text)
        if analysis.sentiment.polarity > 0.1:
            return 'positive'
        elif analysis.sentiment.polarity < -0.1:
            return 'negative'
        else:
            return 'neutral'

    def train_ml_model(self, X_train, y_train):
        """Train the machine learning model"""
        self.ml_pipeline.fit(X_train, y_train)

    def analyze_ml(self, text):
        """Analyze sentiment using trained ML model"""
        return self.ml_pipeline.predict([text])[0]

    def evaluate_models(self, X_test, y_test):
        """Evaluate all models on test data with proper zero division handling"""
        results = {}
        labels = ['positive', 'negative', 'neutral']

        # VADER evaluation
        vader_preds = [self.analyze_vader(text) for text in X_test]
        results['VADER'] = {
            'accuracy': accuracy_score(y_test, vader_preds),
            'report': classification_report(
                y_test, vader_preds,
                labels=labels,
                zero_division=0
            )
        }

        # TextBlob evaluation
        textblob_preds = [self.analyze_textblob(text) for text in X_test]
        results['TextBlob'] = {
            'accuracy': accuracy_score(y_test, textblob_preds),
            'report': classification_report(
                y_test, textblob_preds,
                labels=labels,
                zero_division=0
            )
        }

        # ML model evaluation
        ml_preds = self.ml_pipeline.predict(X_test)
        results['LinearSVC'] = {
            'accuracy': accuracy_score(y_test, ml_preds),
            'report': classification_report(
                y_test, ml_preds,
                labels=labels,
                zero_division=0
            )
        }

        return results

In [19]:
def sample_data():
    """Create a larger, more balanced sample dataset"""
    positive_samples = [
        ("I love this movie! The acting was amazing.", "positive"),
        ("One of the best movies I've seen this year!", "positive"),
        ("Absolutely fantastic from start to finish!", "positive"),
        ("The cinematography was stunning and moving.", "positive"),
        ("The cast delivered outstanding performances.", "positive"),
        ("A masterpiece of modern cinema.", "positive"),
        ("I was completely captivated throughout.", "positive"),
        ("The director's vision was brilliantly executed.", "positive"),
        ("This film touched me deeply.", "positive"),
        ("Worth watching multiple times.", "positive")
    ]

    negative_samples = [
        ("Terrible film. Waste of time and money.", "negative"),
        ("I hated every minute of this awful movie.", "negative"),
        ("Boring and predictable, wouldn't recommend.", "negative"),
        ("The worst script I've ever seen.", "negative"),
        ("Painfully bad acting from everyone.", "negative"),
        ("A complete disappointment on every level.", "negative"),
        ("I want my two hours back.", "negative"),
        ("Unbearably dull and pointless.", "negative"),
        ("The editing was atrocious.", "negative"),
        ("Not a single redeeming quality.", "negative")
    ]

    neutral_samples = [
        ("It was okay, nothing special.", "neutral"),
        ("The plot was good but the ending disappointed me.", "neutral"),
        ("Average performance with some good moments.", "neutral"),
        ("The cinematography was beautiful but the story was weak.", "neutral"),
        ("Neither good nor bad, just average.", "neutral"),
        ("Some parts worked, others didn't.", "neutral"),
        ("Competently made but forgettable.", "neutral"),
        ("Had potential but didn't quite deliver.", "neutral"),
        ("A mixed bag of quality.", "neutral"),
        ("Not terrible but not great either.", "neutral")
    ]

    all_samples = positive_samples + negative_samples + neutral_samples
    df = pd.DataFrame(all_samples, columns=['text', 'sentiment'])
    return df


In [20]:
def main():
    # Load larger sample data
    df = sample_data()
    analyzer = SentimentAnalyzer()
    df['cleaned_text'] = df['text'].apply(analyzer.clean_text)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['sentiment'],
        test_size=0.3,
        random_state=42,
        stratify=df['sentiment']
    )

    # Train ML model
    analyzer.train_ml_model(X_train, y_train)

    # Evaluate all models
    results = analyzer.evaluate_models(X_test, y_test)

    # Print results
    print("=== Sentiment Analysis Evaluation Results ===")
    for model_name, metrics in results.items():
        print(f"\n{model_name} Results:")
        print(f"Accuracy: {metrics['accuracy']:.2f}")
        print("Classification Report:")
        print(metrics['report'])

    # Demo analysis on new text
    test_texts = [
        "This product is absolutely wonderful!",
        "I'm very disappointed with the service.",
        "It's neither good nor bad, just average.",
        "The acting was superb but the story was lacking.",
        "A complete waste of money, I regret buying this."
    ]

    print("\n=== Live Sentiment Analysis ===")
    for text in test_texts:
        print(f"\nText: {text[:60]}...")
        print(f"VADER: {analyzer.analyze_vader(text)}")
        print(f"TextBlob: {analyzer.analyze_textblob(text)}")
        print(f"LinearSVC: {analyzer.analyze_ml(text)}")

if __name__ == "__main__":
    main()

=== Sentiment Analysis Evaluation Results ===

VADER Results:
Accuracy: 0.67
Classification Report:
              precision    recall  f1-score   support

    positive       0.50      1.00      0.67         3
    negative       1.00      0.67      0.80         3
     neutral       1.00      0.33      0.50         3

    accuracy                           0.67         9
   macro avg       0.83      0.67      0.66         9
weighted avg       0.83      0.67      0.66         9


TextBlob Results:
Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

    positive       0.75      1.00      0.86         3
    negative       1.00      0.67      0.80         3
     neutral       0.67      0.67      0.67         3

    accuracy                           0.78         9
   macro avg       0.81      0.78      0.77         9
weighted avg       0.81      0.78      0.77         9


LinearSVC Results:
Accuracy: 0.44
Classification Report:
              precision