In [None]:
# Install required packages first
!pip install datasets transformers tqdm wordcloud nltk pandas numpy matplotlib seaborn scikit-learn torch
print("✅ Packages installed successfully!")

import nltk
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from tqdm import tqdm
import time

# Download necessary NLTK resources
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

# Set random seed for reproducibility (NumPy and PyTorch generators)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

print("Loading dataset...")
# Load the IMDB dataset and keep only the "train" split, exposing its two
# columns under the names used by the rest of the script.
# The chained, non-inplace rename avoids mutating a column-subset frame.
# The labels are used as-is: the previous `map({1: 1, 0: 0})` was an identity
# mapping and therefore a no-op for 0/1 labels.
dataset = load_dataset("imdb")
df = pd.DataFrame(dataset["train"])[['text', 'label']].rename(
    columns={"text": "review", "label": "sentiment"}
)

# Get English stopwords as a set for O(1) membership tests in clean_text
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize a raw review into a space-separated string of content words.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a
         space;
      3. replace every character that is neither a word character nor
         whitespace with a space (letters, digits and underscores survive);
      4. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text; an empty string if no token survives.
    """
    text = text.lower()
    # Strip line-break markup before punctuation removal: otherwise only the
    # "<", "/" and ">" are removed and a meaningless "br" token is left behind.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # Keep numbers, remove punctuation
    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no separate normalisation pass
    # is needed before re-joining.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply text cleaning with progress bar.
# tqdm.pandas() is called first so that `progress_apply` is available on the
# Series; the cleaned text goes into a new column, leaving `review` untouched.
print("Cleaning text...")
tqdm.pandas(desc="Cleaning reviews")
df['cleaned_review'] = df['review'].progress_apply(clean_text)

# Create a function to visualize the data
def visualize_data(df):
    """Save a two-panel overview figure of the review data.

    The left panel is a count plot of the ``sentiment`` column; the right
    panel is a word cloud built from the ``cleaned_review`` text of all rows
    whose ``sentiment`` equals 1. The figure is written to
    ``sentiment_visualization.png`` in the working directory and then closed.

    Args:
        df: DataFrame with ``sentiment`` and ``cleaned_review`` columns.
    """
    plt.figure(figsize=(12, 5))

    # Left panel: class balance
    plt.subplot(1, 2, 1)
    sns.countplot(data=df, x='sentiment')
    plt.xlabel('Sentiment (0=Negative, 1=Positive)')
    plt.ylabel('Count')
    plt.title('Sentiment Distribution')

    # Right panel: word cloud over the positive reviews only
    plt.subplot(1, 2, 2)
    is_positive = df['sentiment'] == 1
    positive_corpus = ' '.join(df[is_positive]['cleaned_review'])
    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=100,
    )
    cloud_image = cloud_builder.generate(positive_corpus)
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title('Positive Reviews Word Cloud')
    plt.axis('off')

    # Persist to disk and release the figure
    plt.tight_layout()
    plt.savefig('sentiment_visualization.png')
    plt.close()
    print("✅ Visualization saved as 'sentiment_visualization.png'")

# Hold out 20% of the reviews for evaluation, stratified on the label so both
# partitions keep the same class proportions.
print("Splitting data into train and test sets...")
features = df["cleaned_review"]
labels = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=RANDOM_SEED,
)

# TF-IDF settings: vocabulary capped at 10000 terms, unigrams through
# trigrams, dropping terms seen in fewer than 3 documents or in more than 90%
# of them.
print("Vectorizing text data...")
tfidf_settings = {
    "max_features": 10000,
    "ngram_range": (1, 3),
    "min_df": 3,
    "max_df": 0.9,
}
vectorizer = TfidfVectorizer(**tfidf_settings)
# The vocabulary is learned from the training partition only; the test
# partition is merely projected onto it.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("✅ Data processing complete. Ready for model training!")

# Train and evaluate traditional ML models
def train_evaluate_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model``, print its test metrics, and save a confusion-matrix plot.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Human-readable model name; used in the printed messages, the
            plot title, and (lowercased, spaces replaced by underscores) the
            output file name ``<name>_confusion_matrix.png``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Tuple ``(model, accuracy)``: the fitted model (same object that was
        passed in) and its accuracy on the test data.
    """
    print(f"Training {name}...")
    # Time only the fit call. Previously the clock was read after prediction
    # and scoring, so the reported "Training Time" also included inference.
    # perf_counter is a monotonic clock meant for measuring intervals.
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    training_seconds = time.perf_counter() - fit_start

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"✅ {name} Accuracy: {accuracy:.4f}")
    print(f"✅ {name} Training Time: {training_seconds:.2f} seconds")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix (rows: true label, columns: predicted label)
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
    plt.title(f'{name} Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'{name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()

    return model, accuracy

# Logistic Regression: liblinear solver, C=1.0 regularization strength,
# balanced class weights, fixed seed.
model_lr, accuracy_lr = train_evaluate_model(
    LogisticRegression(
        C=1.0,
        class_weight='balanced',
        solver='liblinear',
        max_iter=2000,
        random_state=RANDOM_SEED,
    ),
    "Logistic Regression",
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
)

# Multinomial Naive Bayes with smoothing parameter alpha=0.1
model_nb, accuracy_nb = train_evaluate_model(
    MultinomialNB(alpha=0.1),
    "Naive Bayes",
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
)

# Keep whichever model scored higher; on a tie Logistic Regression is kept.
nb_is_better = accuracy_nb > accuracy_lr
if nb_is_better:
    best_model_name, best_model = "Naive Bayes", model_nb
else:
    best_model_name, best_model = "Logistic Regression", model_lr
print(f"\n✅ Best traditional model: {best_model_name} with accuracy {max(accuracy_lr, accuracy_nb):.4f}")

# Function for feature importance analysis (for Logistic Regression)
def analyze_feature_importance(model, vectorizer, top_n=20):
    """Plot and return the most influential terms of a logistic regression.

    Each vocabulary term is paired with its coefficient from ``model.coef_[0]``
    and ranked by absolute value. A two-panel bar chart (strongest positive
    coefficients on the left, strongest negative on the right) is saved to
    ``feature_importance.png``.

    Args:
        model: Fitted classifier. Anything other than a ``LogisticRegression``
            is ignored.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``;
            its feature order is assumed to match the columns the model was
            trained on.
        top_n: Number of terms shown per panel and number of rows returned.

    Returns:
        DataFrame with columns ``Feature``, ``Importance`` and
        ``Abs_Importance`` holding the ``top_n`` terms by absolute
        coefficient, or ``None`` when ``model`` is not a
        ``LogisticRegression``.
    """
    if not isinstance(model, LogisticRegression):
        return None

    # One row per vocabulary term, strongest coefficients first
    feature_importance = pd.DataFrame({
        'Feature': vectorizer.get_feature_names_out(),
        'Importance': model.coef_[0],
    })
    feature_importance['Abs_Importance'] = np.abs(feature_importance['Importance'])
    feature_importance = feature_importance.sort_values('Abs_Importance', ascending=False)

    panels = (
        ('Positive', feature_importance[feature_importance['Importance'] > 0]),
        ('Negative', feature_importance[feature_importance['Importance'] < 0]),
    )

    plt.figure(figsize=(12, 8))
    for position, (direction, subset) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        # seaborn deprecates `palette` without `hue`; mapping `hue` to the
        # y variable with the legend disabled gives the same per-bar colouring
        # without the warning (requires a seaborn release whose barplot
        # accepts `legend`).
        sns.barplot(
            x='Importance',
            y='Feature',
            hue='Feature',
            data=subset.head(top_n),
            palette='viridis',
            legend=False,
        )
        plt.title(f'Top {top_n} {direction} Features')

    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    print("✅ Feature importance analysis saved as 'feature_importance.png'")

    return feature_importance.head(top_n)

# Analyze feature importance if Logistic Regression is the best model
# (the analysis is coefficient-based, so it is skipped for other model types)
if isinstance(best_model, LogisticRegression):
    top_features = analyze_feature_importance(best_model, vectorizer)
    print("\nTop Features for Sentiment Classification:")
    print(top_features)

# Generate visualizations (writes the overview figure to disk)
visualize_data(df)

# Function to predict sentiment with explanation
def predict_sentiment(review, model=best_model, vectorizer=vectorizer, explain=True):
    """Classify a single review and optionally list the terms behind the decision.

    Args:
        review: Raw review text; it is passed through ``clean_text`` before
            vectorization.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            The default is the ``best_model`` object bound when this function
            was defined.
        vectorizer: Fitted vectorizer matching ``model``. The default is bound
            at definition time as well.
        explain: When true and ``model`` is a ``LogisticRegression``, add up to
            five terms from the review that push towards the predicted class.

    Returns:
        Dict with keys ``review`` (the original text), ``sentiment`` (a label
        string), ``confidence`` (probability of the predicted class formatted
        as a percentage) and, when an explanation was produced,
        ``contributing_terms`` (list of terms, strongest first; possibly
        shorter than five or empty).
    """
    # Clean the review
    review_clean = clean_text(review)

    # Transform with TF-IDF
    review_tfidf = vectorizer.transform([review_clean])

    # Predict
    prediction = model.predict(review_tfidf)[0]
    probability = model.predict_proba(review_tfidf)[0]

    # Determine sentiment
    # NOTE(review): indexing probability by 0/1 assumes model.classes_ is [0, 1].
    sentiment = "Positive 😀" if prediction == 1 else "Negative 😡"
    confidence = probability[1] if prediction == 1 else probability[0]

    result = {
        "review": review,
        "sentiment": sentiment,
        "confidence": f"{confidence:.2%}"
    }

    # Provide explanation if requested
    if explain and isinstance(model, LogisticRegression):
        feature_names = vectorizer.get_feature_names_out()

        # Per-term contribution to the decision function: coefficient * tf-idf.
        # Terms absent from the review have tf-idf 0 and contribute exactly 0.
        feature_contributions = model.coef_[0] * review_tfidf.toarray()[0]

        # Rank the terms in the direction of the predicted class, then keep
        # only those that really push that way. Without the sign filter, a
        # review with fewer than five supporting terms would be padded with
        # zero-contribution vocabulary entries (words not in the review) or
        # with terms that argue for the opposite class.
        if prediction == 1:  # Positive sentiment
            ranked = np.argsort(-feature_contributions)[:5]
            top_indices = [i for i in ranked if feature_contributions[i] > 0]
        else:  # Negative sentiment
            ranked = np.argsort(feature_contributions)[:5]
            top_indices = [i for i in ranked if feature_contributions[i] < 0]

        result["contributing_terms"] = [str(feature_names[i]) for i in top_indices]

    return result

# Test with examples: a smoke check of predict_sentiment on four hand-written
# reviews, using the function's default model and vectorizer.
print("\nTesting the model with example reviews:")
examples = [
    "This movie was absolutely fantastic! I loved every second of it.",
    "Worst movie ever. It was so boring and a waste of time.",
    "The film had good moments but overall it was disappointing.",
    "Amazing performances by all actors, especially the lead. Great directing too!"
]

for example in examples:
    result = predict_sentiment(example)
    print(f"\nReview: {result['review']}")
    print(f"Predicted Sentiment: {result['sentiment']} (Confidence: {result['confidence']})")
    # The explanation key is only present when predict_sentiment produced one
    if "contributing_terms" in result:
        print(f"Top contributing terms: {', '.join(result['contributing_terms'])}")

print("\n✅ Sentiment analysis project completed successfully!")






Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Cleaning text...


Cleaning reviews: 100%|██████████| 25000/25000 [00:04<00:00, 5699.91it/s]


Splitting data into train and test sets...
Vectorizing text data...
✅ Data processing complete. Ready for model training!
Training Logistic Regression...
✅ Logistic Regression Accuracy: 0.8924
✅ Logistic Regression Training Time: 0.51 seconds

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2500
           1       0.88      0.91      0.89      2500

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Training Naive Bayes...
✅ Naive Bayes Accuracy: 0.8630
✅ Naive Bayes Training Time: 0.02 seconds

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      2500
           1       0.86      0.87      0.86      2500

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg 


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=top_positive, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=top_negative, palette='viridis')


✅ Feature importance analysis saved as 'feature_importance.png'

Top Features for Sentiment Classification:
            Feature  Importance  Abs_Importance
9854          worst   -7.364691        7.364691
735             bad   -7.048920        7.048920
4126          great    5.847759        5.847759
703           awful   -5.780678        5.780678
3166      excellent    5.482056        5.482056
1052         boring   -5.116787        5.116787
9588          waste   -4.920640        4.920640
6911           poor   -4.897434        4.897434
893            best    4.669223        4.669223
6728        perfect    4.396526        4.396526
9808      wonderful    4.373603        4.373603
8869       terrible   -4.292384        4.292384
6303        nothing   -4.252912        4.252912
9850          worse   -4.201747        4.201747
4462       horrible   -4.115369        4.115369
430         amazing    3.840192        3.840192
6913         poorly   -3.826718        3.826718
2810           dull   -3.795