In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings('ignore')

# Text processing libraries (optional: when NLTK is not installed the
# notebook prints a notice and relies on basic text processing instead)
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import PorterStemmer, WordNetLemmatizer
except ImportError:
    print("NLTK not available, using basic text processing")
else:
    # Download required NLTK data. Only the imports above are guarded by the
    # try block, so a problem while downloading is not misreported as
    # "NLTK not available". 'punkt_tab' is requested next to 'punkt' because
    # recent NLTK releases look up that resource for word_tokenize.
    for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
        nltk.download(resource, quiet=True)
    print("NLTK components loaded successfully")

# Machine learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Deep learning libraries
import tensorflow as tf
from tensorflow import keras

# Visualization
from wordcloud import WordCloud

# Set random seeds (NumPy and TensorFlow both use the fixed value 42)
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

# Set plotting style: matplotlib's 'seaborn-v0_8' style sheet together with
# seaborn's "husl" color palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Create comprehensive sentiment dataset
def create_sentiment_dataset(random_state=42):
    """
    Create a small, hand-written sentiment dataset of product reviews.

    The dataset holds 25 positive, 25 negative and 25 neutral reviews plus
    10 mixed-sentiment reviews that are labelled 'neutral'.

    Args:
        random_state: Seed passed to ``DataFrame.sample`` for shuffling the
            rows. The default of 42 reproduces the original fixed order;
            pass ``None`` for a different shuffle on every call.

    Returns:
        pandas.DataFrame with the columns 'text' and 'sentiment', shuffled
        and re-indexed from 0.
    """

    # Positive sentiment examples
    positive_texts = [
        "I absolutely love this product! It exceeded all my expectations.",
        "Amazing quality and fast delivery. Highly recommend!",
        "This is the best purchase I've made in years. Perfect!",
        "Outstanding customer service and excellent product quality.",
        "I'm so happy with this purchase. Will definitely buy again!",
        "Fantastic! Everything arrived on time and in perfect condition.",
        "Great value for money. The product works perfectly.",
        "Superb quality and amazing features. Love it!",
        "Excellent product! Exactly what I was looking for.",
        "Perfect fit and great design. Very satisfied!",
        "This product is incredible! Exceeds expectations.",
        "Beautiful design and excellent functionality. Recommended!",
        "Top quality! Fast shipping and great packaging.",
        "Love the features and ease of use. Perfect!",
        "Outstanding value and performance. Very happy!",
        "Brilliant product! Works exactly as described.",
        "Impressive quality and attention to detail.",
        "Wonderful experience from start to finish!",
        "This is exactly what I needed. Perfect solution!",
        "Excellent craftsmanship and great customer support.",
        "Amazing results! Better than expected.",
        "High quality materials and excellent build.",
        "Perfect for my needs. Great functionality!",
        "Superb performance and reliable operation.",
        "Love the innovation and thoughtful design."
    ]

    # Negative sentiment examples
    negative_texts = [
        "Terrible product! Complete waste of money.",
        "Poor quality and disappointing performance. Avoid!",
        "This is the worst purchase I've ever made.",
        "Awful customer service and defective product.",
        "I hate this product. Nothing works as advertised.",
        "Horrible experience. Product broke immediately.",
        "Overpriced and poor quality. Very disappointed.",
        "Useless product with terrible design flaws.",
        "Don't buy this! It's a complete scam.",
        "Worst quality I've ever seen. Returned immediately.",
        "Broken on arrival and no customer support.",
        "This product is garbage. Save your money!",
        "Terrible build quality and false advertising.",
        "Complete disappointment. Nothing works properly.",
        "Poor materials and shoddy construction.",
        "Defective product with misleading description.",
        "Nightmare experience with this purchase.",
        "Cheap quality and overpriced. Avoid at all costs!",
        "Broken after one day. Terrible reliability.",
        "Worst customer service experience ever.",
        "Product failed completely. Total waste.",
        "Inferior quality and poor performance.",
        "Disappointing results and bad value.",
        "Unreliable and poorly designed product.",
        "Frustrated with poor quality and service."
    ]

    # Neutral sentiment examples
    neutral_texts = [
        "The product is okay. Nothing special but works.",
        "Average quality for the price. Could be better.",
        "It's fine. Does what it's supposed to do.",
        "Decent product with standard features.",
        "Regular quality item. Nothing extraordinary.",
        "Standard product with basic functionality.",
        "It works as described. No surprises.",
        "Average performance and typical quality.",
        "Basic product that meets minimum requirements.",
        "Normal quality for this price range.",
        "The product is acceptable but not outstanding.",
        "Standard features and regular performance.",
        "It's an ordinary product with basic design.",
        "Typical quality and standard delivery.",
        "Regular item with expected functionality.",
        "Average build quality and normal features.",
        "Standard product that works adequately.",
        "Basic design with typical performance.",
        "It's a regular product with normal quality.",
        "Standard functionality and average materials.",
        "Ordinary product with expected features.",
        "Normal quality and standard performance.",
        "Basic item that meets requirements.",
        "Average product with typical characteristics.",
        "Standard quality and regular functionality."
    ]

    # Mixed sentiments add variation (we'll classify these as neutral)
    mixed_texts = [
        "Good product but expensive for what you get.",
        "Fast delivery but product quality could be better.",
        "Great customer service but the product is average.",
        "Love the design but functionality is limited.",
        "Excellent packaging but disappointing content.",
        "Quick shipping but poor build quality.",
        "Nice features but overpriced for the value.",
        "Good concept but poor execution.",
        "Beautiful appearance but lacks durability.",
        "Helpful support but defective product."
    ]

    # Texts and labels are derived from the same (label, texts) pairs so the
    # two columns can never get out of step. The group order is kept as
    # positive, negative, neutral, mixed because the seeded shuffle below
    # depends on the pre-shuffle row order.
    labelled_groups = [
        ('positive', positive_texts),
        ('negative', negative_texts),
        ('neutral', neutral_texts),
        ('neutral', mixed_texts),
    ]
    texts = [text for _, group in labelled_groups for text in group]
    labels = [label for label, group in labelled_groups for _ in group]

    df = pd.DataFrame({
        'text': texts,
        'sentiment': labels
    })

    # Shuffle the dataset
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build the dataset; the resulting frame is stored in `sentiment_df`
sentiment_df = create_sentiment_dataset()

# Report the size and layout of the frame
n_samples = len(sentiment_df)
column_names = list(sentiment_df.columns)
print(f"Dataset created with {n_samples} samples")
print(f"Columns: {column_names}")

# Number of samples per sentiment class
class_counts = sentiment_df['sentiment'].value_counts()
print("\nClass distribution:")
print(class_counts)

# Preview the first ten rows
preview_rows = sentiment_df.head(10)
print("\nSample data:")
print(preview_rows)


In [None]:
# Exploratory Data Analysis
def analyze_text_features(df):
    """
    Add simple surface-level text statistics to the sentiment dataset.

    The new columns are written into ``df`` itself (the frame is modified
    in place) and the same frame is returned for convenience.

    Args:
        df: DataFrame with a string column named 'text'.

    Returns:
        ``df`` with these added columns:
        text_length (characters), word_count (whitespace-separated tokens),
        sentence_count (approximate), exclamation_count, question_count and
        uppercase_ratio (uppercase characters / all characters, 0 for empty
        text).
    """
    text = df['text']

    # Calculate text statistics
    df['text_length'] = text.str.len()
    df['word_count'] = text.str.split().str.len()

    # Approximate sentence count: the number of non-blank segments between
    # runs of '.', '!' or '?'. Counting "terminators + 1" would report two
    # sentences for "Hello." and one for an empty string.
    df['sentence_count'] = text.apply(
        lambda t: sum(1 for part in re.split(r'[.!?]+', t) if part.strip())
    )

    # Series.str.count interprets its argument as a regular expression, so
    # the question mark must be escaped; a bare '?' is an invalid pattern.
    df['exclamation_count'] = text.str.count('!')
    df['question_count'] = text.str.count(r'\?')

    df['uppercase_ratio'] = text.apply(
        lambda t: sum(1 for c in t if c.isupper()) / len(t) if t else 0
    )

    return df

# Perform EDA
sentiment_df = analyze_text_features(sentiment_df)

# One fixed color per sentiment. pandas hands back the classes in different
# orders (value_counts sorts by frequency, groupby sorts alphabetically), so
# colors are looked up by label instead of being assigned by position;
# otherwise the same sentiment changes color from chart to chart.
colors = ['lightgreen', 'lightcoral', 'lightblue']
sentiment_colors = dict(zip(['positive', 'negative', 'neutral'], colors))


def colors_for(sentiment_labels):
    """Return the plot color of each sentiment label, in the given order."""
    return [sentiment_colors.get(label, 'lightgray') for label in sentiment_labels]


def plot_hist_by_sentiment(column, xlabel, title):
    """Overlay one histogram of ``column`` per sentiment on the current axes."""
    for sentiment in sentiment_df['sentiment'].unique():
        data = sentiment_df[sentiment_df['sentiment'] == sentiment][column]
        plt.hist(data, alpha=0.6, label=sentiment, bins=15,
                 color=sentiment_colors.get(sentiment))
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.title(title)
    plt.legend()


def plot_mean_by_sentiment(column, title, ylabel):
    """Bar-chart the per-sentiment mean of ``column`` on the current axes.

    Returns the series of means and the bar container.
    """
    means = sentiment_df.groupby('sentiment')[column].mean()
    bar_container = plt.bar(means.index, means.values, color=colors_for(means.index))
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    return means, bar_container


# Visualize the dataset
plt.figure(figsize=(18, 12))

# Class distribution
plt.subplot(3, 4, 1)
sentiment_counts = sentiment_df['sentiment'].value_counts()
plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
        colors=colors_for(sentiment_counts.index))
plt.title('Sentiment Distribution')

# Text length distribution by sentiment
plt.subplot(3, 4, 2)
plot_hist_by_sentiment('text_length', 'Text Length (characters)',
                       'Text Length Distribution by Sentiment')

# Word count distribution
plt.subplot(3, 4, 3)
plot_hist_by_sentiment('word_count', 'Word Count',
                       'Word Count Distribution by Sentiment')

# Exclamation marks usage
plt.subplot(3, 4, 4)
exclamation_by_sentiment, bars = plot_mean_by_sentiment(
    'exclamation_count', 'Average Exclamation Marks by Sentiment', 'Average Count'
)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{height:.2f}', ha='center', va='bottom')

# Uppercase ratio by sentiment
plt.subplot(3, 4, 5)
uppercase_by_sentiment, _ = plot_mean_by_sentiment(
    'uppercase_ratio', 'Average Uppercase Ratio by Sentiment', 'Ratio'
)

# Box plot of text length by sentiment
plt.subplot(3, 4, 6)
sentiment_df.boxplot(column='text_length', by='sentiment', ax=plt.gca())
plt.title('Text Length Distribution by Sentiment')
plt.suptitle('')  # Remove automatic title

# Word frequency analysis (lowercased, whitespace-separated tokens)
plt.subplot(3, 4, 7)
word_freq = Counter(
    word for text in sentiment_df['text'] for word in text.lower().split()
)
common_words = word_freq.most_common(15)
words, counts = zip(*common_words)

plt.barh(range(len(words)), counts)
plt.yticks(range(len(words)), words)
plt.xlabel('Frequency')
plt.title('Most Common Words')
plt.gca().invert_yaxis()

# Sentiment-specific word analysis
plt.subplot(3, 4, 8)
words_by_sentiment = {'positive': Counter(), 'negative': Counter(), 'neutral': Counter()}
for text, sentiment in zip(sentiment_df['text'], sentiment_df['sentiment']):
    # Any label other than positive/negative is counted as neutral
    bucket = sentiment if sentiment in ('positive', 'negative') else 'neutral'
    words_by_sentiment[bucket].update(text.lower().split())

pos_freq = words_by_sentiment['positive'].most_common(10)
neg_freq = words_by_sentiment['negative'].most_common(10)
neu_freq = words_by_sentiment['neutral'].most_common(10)

# Plot top positive words (negative/neutral top words are printed below)
pos_words, pos_counts = zip(*pos_freq)
y_pos = np.arange(len(pos_words))
plt.barh(y_pos, pos_counts, alpha=0.7, color='green')
plt.yticks(y_pos, pos_words)
plt.xlabel('Frequency')
plt.title('Top Words in Positive Reviews')
plt.gca().invert_yaxis()

# Statistical summary
plt.subplot(3, 4, 9)
stats_summary = sentiment_df.groupby('sentiment')[['text_length', 'word_count', 'exclamation_count']].mean()
stats_summary.plot(kind='bar', ax=plt.gca())
plt.title('Average Statistics by Sentiment')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Correlation matrix of features
plt.subplot(3, 4, 10)
feature_cols = ['text_length', 'word_count', 'exclamation_count', 'question_count', 'uppercase_ratio']
correlation_matrix = sentiment_df[feature_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=plt.gca())
plt.title('Feature Correlation Matrix')

# Text complexity analysis
plt.subplot(3, 4, 11)
sentiment_df['avg_word_length'] = sentiment_df['text'].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if x.split() else 0
)
complexity_by_sentiment, _ = plot_mean_by_sentiment(
    'avg_word_length', 'Average Word Length by Sentiment', 'Average Word Length'
)

# Punctuation usage
plt.subplot(3, 4, 12)
sentiment_df['punctuation_ratio'] = sentiment_df['text'].apply(
    lambda x: sum(1 for c in x if c in string.punctuation) / len(x) if len(x) > 0 else 0
)
punct_by_sentiment, _ = plot_mean_by_sentiment(
    'punctuation_ratio', 'Average Punctuation Ratio by Sentiment', 'Ratio'
)

plt.tight_layout()
plt.show()

# Print detailed statistics
print("\nDetailed Text Statistics by Sentiment:")
print("=" * 50)
summary_stats = sentiment_df.groupby('sentiment')[['text_length', 'word_count', 'exclamation_count', 
                                                   'uppercase_ratio', 'avg_word_length', 'punctuation_ratio']].describe()
print(summary_stats)

# Most frequent words per sentiment
print("\nMost Common Words by Sentiment:")
print(f"Positive: {[word for word, count in pos_freq[:5]]}")
print(f"Negative: {[word for word, count in neg_freq[:5]]}")
print(f"Neutral: {[word for word, count in neu_freq[:5]]}")


In [None]:
# Text Preprocessing Pipeline
class SentimentTextPreprocessor:
    """
    A configurable text preprocessing pipeline for sentiment analysis.

    The pipeline cleans the text (URLs, e-mail addresses, contractions,
    whitespace) and then optionally lowercases, strips punctuation, removes
    stopwords, stems and lemmatizes. NLTK is used when its components and
    data are available; otherwise tokenization falls back to whitespace
    splitting and the stopword/stemming/lemmatization steps are skipped.

    Args:
        lowercase: Convert text to lowercase.
        remove_punctuation: Strip every character in ``string.punctuation``.
        remove_stopwords: Drop NLTK English stopwords (needs NLTK).
        stem_words: Apply the Porter stemmer (needs NLTK).
        lemmatize_words: Apply the WordNet lemmatizer (needs NLTK).
    """

    # (compiled pattern, expansion) pairs, applied in this order. The two
    # irregular forms come first so the generic "n't" rule does not turn
    # "won't" into "wo not". Suffix rules require a preceding word character
    # and a following word boundary so quoted words such as 'dance' are not
    # mistaken for contractions. Matching ignores case.
    _CONTRACTION_RULES = tuple(
        (re.compile(pattern, re.IGNORECASE), expansion)
        for pattern, expansion in (
            (r"\bwon't\b", "will not"),
            (r"\bcan't\b", "cannot"),
            (r"(?<=\w)n't\b", " not"),
            (r"(?<=\w)'re\b", " are"),
            (r"(?<=\w)'ve\b", " have"),
            (r"(?<=\w)'ll\b", " will"),
            (r"(?<=\w)'d\b", " would"),
            (r"(?<=\w)'m\b", " am"),
        )
    )

    def __init__(self, 
                 lowercase=True,
                 remove_punctuation=False,  # Keep punctuation for sentiment
                 remove_stopwords=False,    # Stopwords can be sentiment-relevant
                 stem_words=False,
                 lemmatize_words=False):

        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_stopwords = remove_stopwords
        self.stem_words = stem_words
        self.lemmatize_words = lemmatize_words

        # Defaults so the attributes always exist, even without NLTK
        self.stop_words = set()
        self.stemmer = None
        self.lemmatizer = None

        # Initialize NLTK components if available. NameError means the NLTK
        # names were never imported; LookupError means a required NLTK data
        # resource could not be found.
        try:
            self.stop_words = set(stopwords.words('english'))
            self.stemmer = PorterStemmer()
            self.lemmatizer = WordNetLemmatizer()
            self.nltk_available = True
        except (NameError, LookupError):
            self.nltk_available = False
            print("NLTK not available, using basic preprocessing")

    @staticmethod
    def _match_case(matched, expansion):
        """Return ``expansion`` cased like the contraction text it replaces."""
        if matched.isupper():
            return expansion.upper()
        if matched[0].isupper():
            return expansion.capitalize()
        return expansion

    def clean_text(self, text):
        """
        Basic text cleaning while preserving sentiment information.

        Removes URLs and e-mail addresses, expands common English
        contractions and collapses whitespace.

        Args:
            text: Any object; it is converted with ``str()`` first.

        Returns:
            The cleaned string without leading or trailing whitespace.
        """
        # Convert to string if not already
        text = str(text)

        # Treat typographic apostrophes like ASCII ones so that contractions
        # written with either character are expanded
        text = text.replace('\u2019', "'")

        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Handle contractions (expand them)
        for pattern, expansion in self._CONTRACTION_RULES:
            text = pattern.sub(
                lambda match, expansion=expansion: self._match_case(match.group(0), expansion),
                text,
            )

        # Collapse whitespace last so the gaps left by removed URLs and
        # e-mail addresses do not survive as double spaces
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def _tokenize(self, text):
        """Split ``text`` into tokens, preferring NLTK's word tokenizer."""
        if self.nltk_available:
            try:
                return word_tokenize(text)
            except LookupError:
                # Tokenizer data is missing; fall through to the basic split
                pass
        return text.split()

    def preprocess(self, text):
        """
        Complete preprocessing pipeline for a single text.

        Steps, in order: clean, lowercase, remove punctuation, tokenize,
        remove stopwords, stem, lemmatize. Each optional step runs only when
        its flag is set; the last three also require NLTK.

        Returns:
            The processed tokens joined by single spaces.
        """
        # Clean text
        text = self.clean_text(text)

        # Convert to lowercase
        if self.lowercase:
            text = text.lower()

        # Remove punctuation (optional - often kept for sentiment)
        if self.remove_punctuation:
            text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize
        tokens = self._tokenize(text)

        # Remove stopwords (optional - often kept for sentiment)
        if self.remove_stopwords and self.nltk_available:
            tokens = [token for token in tokens if token not in self.stop_words]

        # Stemming
        if self.stem_words and self.nltk_available:
            tokens = [self.stemmer.stem(token) for token in tokens]

        # Lemmatization
        if self.lemmatize_words and self.nltk_available:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        return ' '.join(tokens)

    def preprocess_dataset(self, texts):
        """
        Preprocess an iterable of texts.

        Returns:
            A list with one processed string per input text, in input order.
        """
        return [self.preprocess(text) for text in texts]

# Test different preprocessing strategies
preprocessing_strategies = {
    'minimal': SentimentTextPreprocessor(lowercase=True, remove_punctuation=False, remove_stopwords=False),
    'moderate': SentimentTextPreprocessor(lowercase=True, remove_punctuation=True, remove_stopwords=False),
    'aggressive': SentimentTextPreprocessor(lowercase=True, remove_punctuation=True, remove_stopwords=True, stem_words=True),
    'lemmatized': SentimentTextPreprocessor(lowercase=True, remove_punctuation=False, remove_stopwords=False, lemmatize_words=True)
}

# Apply different preprocessing strategies to a few raw examples
sample_texts = sentiment_df['text'].head(5).tolist()

print("Original vs Preprocessed Text Examples:")
print("=" * 80)

for i, original_text in enumerate(sample_texts):
    print(f"\nExample {i+1}:")
    print(f"Original: {original_text}")

    for strategy_name, preprocessor in preprocessing_strategies.items():
        processed_text = preprocessor.preprocess(original_text)
        print(f"{strategy_name.capitalize():12}: {processed_text}")
    print("-" * 60)

# Preprocess the whole dataset exactly once per strategy and time each run,
# so the statistics, the plots and the processing-time chart all describe
# the same pass over the data.
import time  # only needed for the timing in this cell

all_texts = sentiment_df['text'].tolist()
processed_datasets = {}
processing_seconds = {}
for strategy_name, preprocessor in preprocessing_strategies.items():
    started = time.perf_counter()
    processed_datasets[strategy_name] = preprocessor.preprocess_dataset(all_texts)
    processing_seconds[strategy_name] = time.perf_counter() - started

strategy_names = list(processed_datasets)
strategy_colors = ['skyblue', 'lightcoral', 'lightgreen', 'orange']

# Per-strategy statistics, in the same order as strategy_names
vocab_sizes = []
avg_lengths = []
processed_char_counts = []
for strategy_name in strategy_names:
    texts = processed_datasets[strategy_name]
    token_lists = [text.split() for text in texts]
    vocab_sizes.append(len({token for tokens in token_lists for token in tokens}))
    avg_lengths.append(np.mean([len(tokens) for tokens in token_lists]))
    processed_char_counts.append(np.mean([len(text) for text in texts]))

# Analyze the impact of preprocessing on vocabulary size
print("\nVocabulary Size Analysis:")
print("-" * 30)

for strategy_name, vocab_size, avg_text_length in zip(strategy_names, vocab_sizes, avg_lengths):
    print(f"{strategy_name.capitalize():12}: Vocab={vocab_size:4d}, Avg Length={avg_text_length:.1f}")

# Visualize preprocessing impact
plt.figure(figsize=(15, 10))

# Vocabulary size comparison
plt.subplot(2, 3, 1)
plt.bar(strategy_names, vocab_sizes, color=strategy_colors)
plt.title('Vocabulary Size by Preprocessing Strategy')
plt.ylabel('Vocabulary Size')
plt.xticks(rotation=45)

# Average text length comparison
plt.subplot(2, 3, 2)
plt.bar(strategy_names, avg_lengths, color=strategy_colors)
plt.title('Average Text Length by Preprocessing')
plt.ylabel('Average Word Count')
plt.xticks(rotation=45)

# Character count comparison against the unprocessed text
plt.subplot(2, 3, 3)
original_char_count = np.mean([len(text) for text in sentiment_df['text']])

x = range(len(strategy_names))
plt.bar(x, processed_char_counts, alpha=0.7, label='Processed')
plt.axhline(y=original_char_count, color='red', linestyle='--', label='Original')
plt.xticks(x, strategy_names, rotation=45)
plt.title('Character Count Comparison')
plt.ylabel('Average Character Count')
plt.legend()

# Word frequency distribution changes
plt.subplot(2, 3, 4)
# Compare word frequencies for minimal vs aggressive preprocessing
minimal_freq = Counter(
    word for text in processed_datasets['minimal'] for word in text.split()
)
aggressive_freq = Counter(
    word for text in processed_datasets['aggressive'] for word in text.split()
)

# Plot top 10 words for each
top_minimal = minimal_freq.most_common(10)
top_aggressive = aggressive_freq.most_common(10)

min_words, min_counts = zip(*top_minimal)
plt.barh(range(len(min_words)), min_counts, alpha=0.7, label='Minimal')
plt.yticks(range(len(min_words)), min_words)
plt.xlabel('Frequency')
plt.title('Top Words: Minimal Preprocessing')
plt.gca().invert_yaxis()

plt.subplot(2, 3, 5)
agg_words, agg_counts = zip(*top_aggressive)
plt.barh(range(len(agg_words)), agg_counts, alpha=0.7, label='Aggressive', color='orange')
plt.yticks(range(len(agg_words)), agg_words)
plt.xlabel('Frequency')
plt.title('Top Words: Aggressive Preprocessing')
plt.gca().invert_yaxis()

# Preprocessing time analysis: measured wall-clock time of the dataset pass
# above, shown relative to the first strategy ('minimal'). The values vary
# from run to run.
plt.subplot(2, 3, 6)
baseline_seconds = processing_seconds[strategy_names[0]] or 1.0  # avoid dividing by zero
processing_times = [processing_seconds[name] / baseline_seconds for name in strategy_names]
plt.bar(strategy_names, processing_times, color=strategy_colors)
plt.title('Relative Processing Time')
plt.ylabel('Relative Time')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("\nPreprocessing Strategy Recommendations:")
print("-" * 40)
print("1. Minimal: Best for preserving original sentiment signals")
print("2. Moderate: Good balance between noise reduction and signal preservation")
print("3. Aggressive: Smallest vocabulary but may lose important sentiment indicators")
print("4. Lemmatized: Good for reducing inflectional forms while preserving meaning")
print("\nFor sentiment analysis, 'minimal' or 'lemmatized' are often preferred.")
print("Punctuation and certain 'stopwords' can carry important sentiment information!")


In [None]:
# Baseline Models with Traditional Features
def build_baseline_models(texts, labels, test_size=0.2):
    """
    Build and evaluate baseline models using traditional NLP features.

    Every feature extractor (word counts, word TF-IDF, character TF-IDF) is
    combined with every classifier (multinomial naive Bayes, logistic
    regression). Each combination is trained on the training split and
    scored on the test split.

    Args:
        texts: Sequence of (preprocessed) text strings.
        labels: Sequence of class labels aligned with ``texts``; also used
            to stratify the split.
        test_size: Fraction of the data held out for testing.

    Returns:
        Tuple ``(results, X_train_text, X_test_text, y_train, y_test)``.
        ``results`` maps "<extractor>_<model>" to a dict with the keys
        'accuracy', 'y_true', 'y_pred', 'model' and 'vectorizer'. The stored
        model is the estimator fitted on that vectorizer's features.
    """
    # sklearn is already a dependency of this function; clone is needed
    # only here
    from sklearn.base import clone

    # Split the data
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )

    # Feature extraction methods
    feature_extractors = {
        'count_vectorizer': CountVectorizer(max_features=5000, ngram_range=(1, 2)),
        'tfidf_vectorizer': TfidfVectorizer(max_features=5000, ngram_range=(1, 2)),
        'tfidf_char': TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000)
    }

    # Machine learning models (unfitted prototypes; cloned per combination)
    ml_models = {
        'naive_bayes': MultinomialNB(),
        'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
    }

    results = {}

    # Test each combination
    for feat_name, vectorizer in feature_extractors.items():
        print(f"\nTesting {feat_name}...")

        # Extract features
        X_train_features = vectorizer.fit_transform(X_train_text)
        X_test_features = vectorizer.transform(X_test_text)

        print(f"Feature matrix shape: {X_train_features.shape}")

        for model_name, prototype in ml_models.items():
            print(f"  Training {model_name}...")

            # Fit a fresh copy for this feature set. Refitting one shared
            # instance would leave every stored 'model' fitted on the last
            # feature space only, out of step with its stored 'vectorizer'.
            model = clone(prototype)
            model.fit(X_train_features, y_train)

            # Make predictions
            y_pred = model.predict(X_test_features)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)

            # Store results
            combo_name = f"{feat_name}_{model_name}"
            results[combo_name] = {
                'accuracy': accuracy,
                'y_true': y_test,
                'y_pred': y_pred,
                'model': model,
                'vectorizer': vectorizer
            }

            print(f"    Accuracy: {accuracy:.4f}")

    return results, X_train_text, X_test_text, y_train, y_test

# Prepare data for baseline models
preprocessor = preprocessing_strategies['minimal']  # Use minimal preprocessing
processed_texts = preprocessor.preprocess_dataset(sentiment_df['text'].tolist())
labels = sentiment_df['sentiment'].tolist()

# Encode labels as integers (the index into label_encoder.classes_)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

print("Label encoding:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

# Build baseline models
baseline_results, X_train_text, X_test_text, y_train, y_test = build_baseline_models(
    processed_texts, encoded_labels
)

# Analyze baseline results
print("\nBaseline Model Performance Summary:")
print("=" * 50)

for model_name, result in baseline_results.items():
    print(f"{model_name:30s}: {result['accuracy']:.4f}")

# Pick the most accurate combination; on ties the first one wins. Using
# max() always yields a real key, even if every accuracy is 0.
best_model_name = max(baseline_results, key=lambda name: baseline_results[name]['accuracy'])
best_result = baseline_results[best_model_name]
best_accuracy = best_result['accuracy']

print(f"\nBest baseline model: {best_model_name} with accuracy {best_accuracy:.4f}")

# Detailed analysis of best model
print(f"\nDetailed Classification Report for {best_model_name}:")
print(classification_report(best_result['y_true'], best_result['y_pred'], 
                          target_names=label_encoder.classes_))

# Visualize baseline results
plt.figure(figsize=(15, 10))

class_names = label_encoder.classes_
# Color per class name, so a class keeps its color whatever its position
class_color_by_name = {'positive': 'lightgreen', 'negative': 'lightcoral', 'neutral': 'lightblue'}

# Model performance comparison
plt.subplot(2, 3, 1)
model_names = list(baseline_results.keys())
accuracies = [baseline_results[name]['accuracy'] for name in model_names]

bars = plt.bar(range(len(model_names)), accuracies, color=['skyblue', 'lightcoral', 'lightgreen', 'orange', 'pink', 'yellow'])
plt.xticks(range(len(model_names)), [name.replace('_', '\n') for name in model_names], rotation=45, ha='right')
plt.title('Baseline Model Accuracies')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{acc:.3f}', ha='center', va='bottom')

# Confusion matrix for best model
plt.subplot(2, 3, 2)
cm = confusion_matrix(best_result['y_true'], best_result['y_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=label_encoder.classes_, 
            yticklabels=label_encoder.classes_)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Feature importance analysis (for logistic regression)
plt.subplot(2, 3, 3)
if 'logistic_regression' in best_model_name:
    model = best_result['model']
    vectorizer = best_result['vectorizer']

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    # Get coefficients for each class
    if hasattr(model, 'coef_'):
        if len(model.coef_) == 1:
            coef = model.coef_[0]
        else:
            # For multi-class, show the coefficients of the positive class.
            # The row is looked up through the encoder and model.classes_
            # instead of assuming that 'positive' is class 2.
            positive_code = label_encoder.transform(['positive'])[0]
            coef = model.coef_[list(model.classes_).index(positive_code)]

        if len(coef) == len(feature_names):
            # Get top positive and negative features
            top_pos_idx = np.argsort(coef)[-10:]
            top_neg_idx = np.argsort(coef)[:10]

            top_features = np.concatenate([top_neg_idx, top_pos_idx])
            top_coefs = coef[top_features]
            top_names = [feature_names[i] for i in top_features]

            # Separate name so the notebook-level `colors` list is untouched
            coef_colors = ['red' if c < 0 else 'green' for c in top_coefs]
            plt.barh(range(len(top_names)), top_coefs, color=coef_colors, alpha=0.7)
            plt.yticks(range(len(top_names)), top_names)
            plt.title('Top Features (Logistic Regression)')
            plt.xlabel('Coefficient Value')
        else:
            # Guard against a model and vectorizer that were fitted on
            # different feature spaces; their names would not line up
            plt.text(0.5, 0.5, 'Model and vectorizer feature counts differ',
                     ha='center', va='center')
            plt.axis('off')

# Error analysis
plt.subplot(2, 3, 4)
# Analyze misclassifications
correct_predictions = best_result['y_true'] == best_result['y_pred']
error_rate_by_class = []

for i, class_name in enumerate(class_names):
    class_mask = best_result['y_true'] == i
    class_correct = correct_predictions[class_mask]
    error_rate = 1 - np.mean(class_correct) if len(class_correct) > 0 else 0
    error_rate_by_class.append(error_rate)

plt.bar(class_names, error_rate_by_class,
        color=[class_color_by_name.get(name, 'lightgray') for name in class_names])
plt.title('Error Rate by Class')
plt.ylabel('Error Rate')
plt.xticks(rotation=45)

# Performance vs feature dimension
plt.subplot(2, 3, 5)
tfidf_prefix = 'tfidf_vectorizer_'
# (model label, number of features, accuracy) for the word-level TF-IDF runs
tfidf_points = [
    (name.replace(tfidf_prefix, ''),
     len(result['vectorizer'].get_feature_names_out()),
     result['accuracy'])
    for name, result in baseline_results.items()
    if 'tfidf_vectorizer' in name
]

if tfidf_points:
    feature_dims = [dim for _, dim, _ in tfidf_points]
    performances = [acc for _, _, acc in tfidf_points]
    plt.scatter(feature_dims, performances, alpha=0.7)
    plt.xlabel('Number of Features')
    plt.ylabel('Accuracy')
    plt.title('Performance vs Feature Dimension')

    for point_label, dim, acc in tfidf_points:
        plt.annotate(point_label, (dim, acc))

# Class distribution in predictions
plt.subplot(2, 3, 6)
# minlength keeps one count per class even when a class never occurs in the
# predictions, so both arrays always match the x positions below
true_dist = np.bincount(best_result['y_true'], minlength=len(class_names))
pred_dist = np.bincount(best_result['y_pred'], minlength=len(class_names))

x = np.arange(len(class_names))
width = 0.35

plt.bar(x - width/2, true_dist, width, label='True', alpha=0.7)
plt.bar(x + width/2, pred_dist, width, label='Predicted', alpha=0.7)
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('True vs Predicted Class Distribution')
plt.xticks(x, class_names)
plt.legend()

plt.tight_layout()
plt.show()

# Generate some predictions for interpretation
print(f"\nSample Predictions from Best Model ({best_model_name}):")
print("-" * 60)


def preview_text(text, limit=60):
    """Return ``text`` cut to ``limit`` characters, with '...' only if it was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


def print_predictions(indices, max_examples=3):
    """Print text, true label and predicted label for the first test-set indices."""
    for i, idx in enumerate(indices[:max_examples]):
        true_label = label_encoder.classes_[best_result['y_true'][idx]]
        pred_label = label_encoder.classes_[best_result['y_pred'][idx]]

        print(f"{i+1}. Text: '{preview_text(X_test_text[idx])}'")
        print(f"   True: {true_label}, Predicted: {pred_label}")


# Positions (within the test split) of correct and incorrect predictions
correct_indices = np.where(best_result['y_true'] == best_result['y_pred'])[0]
incorrect_indices = np.where(best_result['y_true'] != best_result['y_pred'])[0]

# Show 3 correct predictions
print("Correct Predictions:")
print_predictions(correct_indices)

# Show up to 3 incorrect predictions
print("\nIncorrect Predictions:")
print_predictions(incorrect_indices)

# Accuracy expected from guessing uniformly among the classes
random_baseline = 1 / len(label_encoder.classes_)

print("\nBaseline Performance Summary:")
print(f"Best accuracy: {best_accuracy:.4f}")
print(f"Random baseline: {random_baseline:.4f}")
print(f"Improvement over random: {(best_accuracy - random_baseline) / random_baseline * 100:.1f}%")

print(f"\nReady for RNN-based models! Target to beat: {best_accuracy:.4f}")
