# Week 6, Day 2: Text Classification and Sentiment Analysis

## Learning Objectives
- Understand text classification concepts
- Learn sentiment analysis techniques
- Master feature engineering for text
- Practice implementing classifiers

## Topics Covered
1. Text Classification Basics
2. Feature Engineering
3. Sentiment Analysis
4. Model Evaluation

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## 1. Text Classification Basics

In [None]:
def text_classification_example():
    """Train and compare two sentiment classifiers on a toy review dataset.

    Ten hard-coded movie reviews (label 1 = positive, 0 = negative) are split
    80/20, turned into TF-IDF features, and used to fit a Multinomial Naive
    Bayes model and a Logistic Regression model. For each model the
    classification report is printed and the confusion matrix is shown as a
    heatmap.
    """
    # Sample dataset
    texts = [
        "This movie was fantastic! Great acting and plot.",
        "Terrible waste of time. Poor acting and boring story.",
        "Amazing film, highly recommended!",
        "Don't waste your money on this movie.",
        "Excellent performance by the entire cast.",
        "One of the worst movies I've ever seen.",
        "A masterpiece of modern cinema.",
        "Complete disappointment, save your time.",
        "Brilliant direction and storytelling.",
        "Awful plot, terrible acting, avoid at all costs."
    ]

    labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative

    # Fixed class order so reports and matrices always cover both classes,
    # even if a class happens to be missing from the test set or predictions.
    class_ids = [0, 1]
    class_names = ['negative', 'positive']

    # Split data. With only two test samples, stratifying keeps one review of
    # each class in the test set instead of leaving that to chance.
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Create TF-IDF features (vocabulary is fitted on the training split only)
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train models
    models = {
        'Naive Bayes': MultinomialNB(),
        'Logistic Regression': LogisticRegression(random_state=42)
    }

    results = {}
    for name, model in models.items():
        # Train model
        model.fit(X_train_tfidf, y_train)

        # Make predictions
        y_pred = model.predict(X_test_tfidf)

        # Store results. zero_division=0 reports 0.0 (without a warning) for
        # a class the model never predicted on this tiny test set.
        results[name] = {
            'predictions': y_pred,
            'report': classification_report(
                y_test,
                y_pred,
                labels=class_ids,
                target_names=class_names,
                zero_division=0,
            )
        }

    # Print results
    for name, result in results.items():
        print(f"\nResults for {name}:")
        print(result['report'])

        # Plot confusion matrix; explicit labels guarantee a 2x2 matrix
        plt.figure(figsize=(6, 4))
        cm = confusion_matrix(y_test, result['predictions'], labels=class_ids)
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
        )
        plt.title(f'Confusion Matrix - {name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

text_classification_example()

## 2. Feature Engineering

In [None]:
def feature_engineering_example():
    """Extract simple hand-crafted features from a sample review and show them.

    Computes character length, word count, average word length, number of
    all-uppercase words and number of exclamation marks for one hard-coded
    sentence, plots them as a bar chart, and prints each value.
    """
    # Sample text
    text = "This movie was AMAZING! The acting was great, and I loved the story. Must watch!!!"

    def tokenize(raw_text):
        """Tokenize with NLTK, fetching the tokenizer models if they are missing."""
        try:
            return word_tokenize(raw_text)
        except LookupError:
            # The tokenizer models are a separate download. The resource is
            # named 'punkt' in older NLTK releases and 'punkt_tab' in newer
            # ones, so request both quietly and retry once.
            nltk.download('punkt', quiet=True)
            nltk.download('punkt_tab', quiet=True)
            return word_tokenize(raw_text)

    # Basic features
    def extract_features(text):
        """Return a dict mapping feature name to its numeric value for *text*."""
        features = {}

        # Text length (characters)
        features['text_length'] = len(text)

        # Word count: the tokenizer also emits punctuation tokens such as
        # '!' and ',', so keep only tokens with at least one alphanumeric
        # character.
        tokens = tokenize(text)
        words = [token for token in tokens if any(ch.isalnum() for ch in token)]
        features['word_count'] = len(words)

        # Average word length (0.0 for text without words, instead of NaN)
        features['avg_word_length'] = (
            np.mean([len(word) for word in words]) if words else 0.0
        )

        # Uppercase word count
        features['uppercase_count'] = sum(1 for word in words if word.isupper())

        # Exclamation mark count
        features['exclamation_count'] = text.count('!')

        return features

    # Extract and display features
    features = extract_features(text)

    # Plot features
    plt.figure(figsize=(10, 5))
    plt.bar(features.keys(), features.values())
    plt.title('Text Features')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Print features
    for feature, value in features.items():
        print(f"{feature}: {value}")

feature_engineering_example()

## 3. Sentiment Analysis

In [None]:
def sentiment_analysis_example():
    """Score sample reviews with NLTK's VADER analyzer and visualize the result.

    Downloads the ``vader_lexicon`` resource, computes polarity scores for
    five hard-coded reviews, draws the compound scores and the stacked
    positive/neutral/negative components side by side in a single figure,
    and prints a per-review summary.
    """
    # Download NLTK resources
    nltk.download('vader_lexicon')
    # Imported locally, after the lexicon download
    from nltk.sentiment import SentimentIntensityAnalyzer

    # Sample reviews
    reviews = [
        "This product is amazing! I love everything about it.",
        "Terrible quality, complete waste of money.",
        "It's okay, nothing special but gets the job done.",
        "Could be better, but not the worst I've seen.",
        "Absolutely fantastic service and product quality!"
    ]

    # Initialize analyzer
    sia = SentimentIntensityAnalyzer()

    # Analyze sentiments
    results = []
    for review in reviews:
        scores = sia.polarity_scores(review)
        results.append({
            'text': review,
            'compound': scores['compound'],
            'positive': scores['pos'],
            'neutral': scores['neu'],
            'negative': scores['neg']
        })

    # Create DataFrame
    df = pd.DataFrame(results)

    # Visualize results
    plt.figure(figsize=(12, 6))

    # Sentiment distribution
    plt.subplot(121)
    plt.bar(range(len(df)), df['compound'])
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title('Compound Sentiment Scores')
    plt.xlabel('Review')
    plt.ylabel('Score')

    # Sentiment components
    components_ax = plt.subplot(122)
    # DataFrame.plot opens a brand-new figure unless it is handed an Axes,
    # which would leave this right-hand subplot empty. Draw into it directly.
    df[['positive', 'neutral', 'negative']].plot(
        kind='bar', stacked=True, ax=components_ax
    )
    components_ax.set_title('Sentiment Components')
    components_ax.set_xlabel('Review')
    components_ax.set_ylabel('Score')

    plt.tight_layout()
    plt.show()

    # Print detailed analysis
    # NOTE(review): labels are taken from the sign of the compound score; a
    # small neutral band around zero (e.g. +/-0.05) is a common alternative —
    # confirm which convention the course wants.
    for result in results:
        compound = result['compound']
        if compound > 0:
            sentiment = 'Positive'
        elif compound < 0:
            sentiment = 'Negative'
        else:
            sentiment = 'Neutral'

        print(f"\nText: {result['text']}")
        print(f"Compound Score: {compound:.3f}")
        print(f"Sentiment: {sentiment}")

sentiment_analysis_example()

## Practical Exercises

In [None]:
# Exercise 1: Custom Text Classifier

def text_classifier_exercise():
    """Exercise scaffold: print the task list for a multi-class sentiment classifier.

    A five-review sample dataset is prepared in ``data`` (keys ``'text'`` and
    ``'sentiment'``) for the student's solution; the function itself only
    prints the instructions and returns ``None``.
    """
    # Sample dataset, written as (review, label) pairs and then split into
    # the two parallel columns.
    labelled_reviews = [
        ("I love this product! Best purchase ever!", 'positive'),
        ("Terrible experience, never buying again.", 'negative'),
        ("Average product, nothing special.", 'neutral'),
        ("Great customer service and quality.", 'positive'),
        ("Waste of money, very disappointed.", 'negative'),
    ]
    data = {
        'text': [review for review, _ in labelled_reviews],
        'sentiment': [label for _, label in labelled_reviews],
    }

    # Task description, one printed line per entry
    instructions = (
        "Task: Build a multi-class sentiment classifier",
        "1. Preprocess the texts",
        "2. Engineer relevant features",
        "3. Train a classifier",
        "4. Evaluate performance",
    )
    for line in instructions:
        print(line)

    # Your code here

text_classifier_exercise()

In [None]:
# Exercise 2: Advanced Feature Engineering

def feature_engineering_exercise():
    """Exercise scaffold: print the task list for advanced text feature design.

    A multi-line sample review is prepared in ``text`` for the student's
    solution; the function itself only prints the instructions and returns
    ``None``.
    """
    # Sample text
    text = """
    This product is AMAZING! The quality is great and the price is reasonable.
    Customer service was very helpful when I had questions. Shipping was fast too!
    Definitely recommend this to everyone!!!
    """

    # Task description, one printed line per entry
    instructions = (
        "Task: Create advanced text features",
        "1. Design custom features",
        "2. Implement feature extraction",
        "3. Analyze feature importance",
        "4. Visualize results",
    )
    for line in instructions:
        print(line)

    # Your code here

feature_engineering_exercise()

## MCQ Quiz

1. What is sentiment analysis?
   - a) Text translation
   - b) Detecting the opinion or emotion expressed in text
   - c) Grammar checking
   - d) Word counting

2. Which feature is most important for sentiment analysis?
   - a) Text length
   - b) Word polarity
   - c) Word count
   - d) Punctuation

3. What is TF-IDF used for?
   - a) Text generation
   - b) Feature extraction
   - c) Sentiment scoring
   - d) Grammar checking

4. Which classifier is commonly used for text?
   - a) K-means
   - b) Naive Bayes
   - c) Decision Trees
   - d) KNN

5. What is feature engineering in NLP?
   - a) Text generation
   - b) Creating useful attributes
   - c) Model training
   - d) Data collection

6. Why use a confusion matrix?
   - a) Feature selection
   - b) Performance evaluation
   - c) Text preprocessing
   - d) Model training

7. What is cross-validation used for?
   - a) Feature extraction
   - b) Model evaluation
   - c) Text cleaning
   - d) Sentiment analysis

8. Which metric is best for imbalanced data?
   - a) Accuracy
   - b) F1-score
   - c) Error rate
   - d) Loss function

9. What is overfitting in text classification?
   - a) Too few features
   - b) Model fits the training data too closely
   - c) Poor preprocessing
   - d) Wrong algorithm

10. Why normalize text features?
    - a) Increase accuracy
    - b) Scale consistency
    - c) Reduce memory
    - d) Speed up training

Answers: 1-b, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-b, 10-b