In [None]:
# Cell 1 - Setup and Imports


In [None]:
# Install required packages
!pip install --upgrade pip setuptools wheel -q
!pip install scikit-learn pandas numpy matplotlib seaborn tqdm -q
!pip install nltk -q
!pip install imbalanced-learn -q  # For SMOTE

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

# ML Models
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# NLP Processing
import re
import string
from collections import Counter, defaultdict
import nltk

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# System utilities
import gc
import json
import random
from datetime import datetime
import pickle

# Set seeds for reproducibility
def set_seed(seed=42):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    # it here presumably only affects child processes - confirm if that matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    
# Fix the RNG state before any stochastic step runs.
set_seed(seed=42)

# Output folders written to by later cells; exist_ok keeps re-runs harmless.
for dir_name in ('plots', 'models', 'results'):
    os.makedirs(dir_name, exist_ok=True)

print("Setup completed successfully!")

# Cell 2 - Load and Analyze Dataset


In [None]:
# Load dataset
df = pd.read_csv('/kaggle/input/final-dataset/final-dataset.csv')
print(f"Original dataset shape: {df.shape}")

# Check unique polarities to determine number of classes
unique_polarities = df['Polarity'].unique()
print(f"\nUnique polarities found: {unique_polarities}")

# Normalise label spelling so that e.g. 'Positive' or ' negative ' are not
# silently mapped to NaN by Series.map below.
polarity_normalized = df['Polarity'].astype(str).str.strip().str.lower()

# Map polarities based on what's in the dataset
if (polarity_normalized == 'neutral').any():
    # 3-class case
    num_classes = 3
    polarity_to_id = {'positive': 1, 'negative': 0, 'neutral': 2}
    class_names = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
    print("\nDetected 3-class sentiment analysis")
else:
    # 2-class case
    num_classes = 2
    polarity_to_id = {'positive': 1, 'negative': 0}
    class_names = {0: 'Negative', 1: 'Positive'}
    print("\nDetected 2-class sentiment analysis")

polarity_ids = polarity_normalized.map(polarity_to_id)

# Fail fast on labels the mapping does not know (including missing labels):
# NaN labels would otherwise only surface later as an obscure error in the
# stratified split / np.bincount.
unmapped_mask = polarity_ids.isna()
if unmapped_mask.any():
    bad_values = df.loc[unmapped_mask, 'Polarity'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} rows have unrecognised 'Polarity' values: {bad_values}. "
        f"Expected one of {sorted(polarity_to_id)}."
    )
df['Polarity'] = polarity_ids.astype(int)

print(f"Number of classes: {num_classes}")
print("\nClass distribution:")
class_dist = df['Polarity'].value_counts().sort_index()
print(class_dist)
print("\nClass percentages:")
print(df['Polarity'].value_counts(normalize=True).sort_index() * 100)

# Add text statistics
df['text_length'] = df['Text'].str.len()
df['word_count'] = df['Text'].str.split().str.len()
df['unique_words'] = df['Text'].apply(lambda x: len(set(str(x).split())))

print(f"\nText statistics:")
print(f"Average text length: {df['text_length'].mean():.1f} characters")
print(f"Average word count: {df['word_count'].mean():.1f} words")
print(f"Average unique words: {df['unique_words'].mean():.1f} words")

# Cell 3 - Advanced Text Preprocessing


In [None]:
class BanglaTextPreprocessor:
    """Regex-based cleaning, tokenisation and stopword removal for Bangla text.

    Attributes:
        bangla_punct: Punctuation characters stripped from the edges of tokens.
        bangla_stopwords: Set of tokens dropped by ``remove_stopwords``.
    """

    def __init__(self):
        # Common Bangla punctuation
        self.bangla_punct = '।॥,;:!?\'"-.…'

        # Common Bangla stopwords (expand this list based on your needs)
        self.bangla_stopwords = {
            'এবং', 'আর', 'কিন্তু', 'তবে', 'যদি', 'তাহলে', 'যে', 'সে',
            'এই', 'সেই', 'ঐ', 'তার', 'তাদের', 'আমার', 'আমাদের',
            'হয়', 'হবে', 'হল', 'করে', 'করা', 'করেন', 'করি', 'করেছে',
            'ছিল', 'আছে', 'থাকে', 'যায়', 'যাবে', 'গেল', 'এর', 'তে',
            'কে', 'না', 'নি', 'নেই', 'তো', 'ও', 'আর', 'কি', 'যা',
            'জন্য', 'মত', 'সব', 'সে', 'এ', 'য়', 'র', 'ই'
        }

    def clean_text(self, text):
        """Return ``text`` reduced to Bangla characters and basic punctuation.

        Args:
            text: Any object; it is converted with ``str()`` first.

        Returns:
            The cleaned string: URLs, e-mail addresses, ASCII letters and digits
            removed, every character outside U+0980-U+09FF / whitespace /
            ``।,!?.-`` replaced by a space, whitespace collapsed, and leading or
            trailing punctuation and spaces stripped.
        """
        text = str(text)

        # Remove URLs (the dot after "www" is escaped so it only matches a
        # literal '.', not any character)
        text = re.sub(r'http\S+|www\.\S+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove English characters but keep Bangla
        text = re.sub(r'[a-zA-Z]+', '', text)

        # Remove numbers (optional - comment out if numbers are important)
        text = re.sub(r'\d+', '', text)

        # Keep only Bangla characters and basic punctuation
        text = re.sub(r'[^\u0980-\u09FF\s।,!?.-]', ' ', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        # Strip punctuation AND spaces together so that input such as
        # "। , word" is fully trimmed in a single pass.
        return text.strip(' ।,!?.-')

    def tokenize(self, text):
        """Split cleaned text on whitespace into tokens longer than one character.

        Punctuation listed in ``bangla_punct`` is stripped from both ends of each
        token, so a stopword followed by a danda or comma still matches the
        stopword list.

        Args:
            text: Raw text; it is passed through ``clean_text`` first.

        Returns:
            List of token strings.
        """
        stripped = (token.strip(self.bangla_punct) for token in self.clean_text(text).split())

        # Remove very short (and now-empty) tokens
        return [token for token in stripped if len(token) > 1]

    def remove_stopwords(self, tokens):
        """Return the tokens that are not in ``bangla_stopwords``."""
        return [token for token in tokens if token not in self.bangla_stopwords]

    def preprocess(self, text, remove_stop=True):
        """Clean, tokenise and optionally drop stopwords.

        Args:
            text: Raw text.
            remove_stop: When true, tokens in ``bangla_stopwords`` are removed.

        Returns:
            The remaining tokens joined by single spaces.
        """
        # tokenize() already cleans the text, so no separate cleaning pass here.
        tokens = self.tokenize(text)

        if remove_stop:
            tokens = self.remove_stopwords(tokens)

        return ' '.join(tokens)

    def get_char_ngrams(self, text, n=3):
        """Return all length-``n`` character windows of the cleaned text.

        Args:
            text: Raw text; it is passed through ``clean_text`` first.
            n: Window length.

        Returns:
            The n-grams joined by single spaces ('' when the cleaned text is
            shorter than ``n``).
        """
        text = self.clean_text(text)
        return ' '.join(text[i:i + n] for i in range(len(text) - n + 1))

# Shared preprocessor instance used by the feature-engineering cells below.
preprocessor = BanglaTextPreprocessor()

# Show the first three texts before and after preprocessing (first 100 chars).
print("Preprocessing examples:")
sample_texts = df['Text'].head(3).values
for i, text in enumerate(sample_texts):
    original_preview = text[:100]
    print(f"\nOriginal {i+1}: {original_preview}...")
    processed = preprocessor.preprocess(text)
    processed_preview = processed[:100]
    print(f"Processed {i+1}: {processed_preview}...")

# Cell 4 - Feature Engineering


In [None]:
def create_features(df, preprocessor):
    """Add text-derived feature columns to ``df`` in place and return it.

    Args:
        df: DataFrame with a ``'Text'`` column. Missing values are treated as
            empty strings and non-string values are converted with ``str``.
        preprocessor: Object providing ``preprocess(text, remove_stop=...)`` and
            ``get_char_ngrams(text, n=...)``.

    Returns:
        The same DataFrame object, with these columns added or overwritten:
        ``processed_text``, ``processed_text_with_stop``, ``char_trigrams``,
        ``char_bigrams``, ``char_count``, ``word_count``, ``unique_word_count``,
        ``word_diversity``, ``exclamation_count``, ``question_count``,
        ``punctuation_count``, ``happy_emoticon``, ``sad_emoticon`` and
        ``avg_word_length``.
    """

    print("Creating features...")

    # Work on a string-only view of the text so a missing value yields zeros
    # instead of NaN counts or a TypeError when iterating characters.
    raw_text = df['Text'].fillna('').astype(str)

    # Basic text features
    df['processed_text'] = raw_text.apply(lambda x: preprocessor.preprocess(x, remove_stop=True))
    df['processed_text_with_stop'] = raw_text.apply(lambda x: preprocessor.preprocess(x, remove_stop=False))

    # Character n-grams
    df['char_trigrams'] = raw_text.apply(lambda x: preprocessor.get_char_ngrams(x, n=3))
    df['char_bigrams'] = raw_text.apply(lambda x: preprocessor.get_char_ngrams(x, n=2))

    # Statistical features
    df['char_count'] = raw_text.str.len()
    df['word_count'] = df['processed_text'].str.split().str.len()
    df['unique_word_count'] = df['processed_text'].apply(lambda x: len(set(x.split())))
    df['word_diversity'] = df['unique_word_count'] / (df['word_count'] + 1)  # +1 to avoid division by zero

    # Punctuation features
    df['exclamation_count'] = raw_text.str.count('!')
    # Only '?' marks a question; the danda '।' is the Bangla full stop and is
    # already covered by punctuation_count.
    df['question_count'] = raw_text.str.count(r'\?')
    df['punctuation_count'] = raw_text.apply(lambda x: sum(1 for c in x if c in '।!?,.-'))

    # Emoticon-like patterns (simplified)
    df['happy_emoticon'] = raw_text.str.count('[:;]-?[)D]')
    df['sad_emoticon'] = raw_text.str.count('[:;]-?[(]')

    # Average word length (0 for texts with no remaining tokens)
    df['avg_word_length'] = df['processed_text'].apply(
        lambda x: np.mean([len(word) for word in x.split()]) if x else 0
    )

    # Note: this reports the total column count of df, not only the new columns.
    print(f"Created {len(df.columns)} features")
    return df

# Enrich the dataset with the engineered feature columns.
df = create_features(df, preprocessor)

# Numeric columns summarised below.
numerical_features = [
    'char_count',
    'word_count',
    'unique_word_count',
    'word_diversity',
    'exclamation_count',
    'question_count',
    'punctuation_count',
    'avg_word_length',
]

# Display feature statistics
print("\nFeature statistics:")
feature_summary = df[numerical_features].describe()
print(feature_summary)

# Cell 5 - Feature Extraction Methods


In [None]:
class FeatureExtractor:
    """Builds, fits and stores text vectorizers for several feature types.

    Attributes:
        max_features: Vocabulary cap passed to every vectorizer.
        token_pattern: Regex used by the word-level vectorizers to find tokens.
        vectorizers: Dict mapping feature type -> fitted vectorizer.
    """

    # sklearn's default pattern is built on ``\w``, which in Python's ``re`` does
    # not match combining marks such as Bangla vowel signs or the virama, so
    # Bangla words are cut apart at those characters. This pattern additionally
    # accepts every code point of the Bengali block (U+0980-U+09FF) and keeps
    # the default minimum token length of two characters.
    DEFAULT_TOKEN_PATTERN = r'[\w\u0980-\u09FF]{2,}'

    SUPPORTED_FEATURE_TYPES = ('tfidf', 'tfidf_char', 'count', 'binary')

    def __init__(self, max_features=5000, token_pattern=None):
        """
        Args:
            max_features: Maximum vocabulary size per vectorizer.
            token_pattern: Optional regex overriding ``DEFAULT_TOKEN_PATTERN``
                for the word-level vectorizers.
        """
        self.max_features = max_features
        self.token_pattern = token_pattern or self.DEFAULT_TOKEN_PATTERN
        self.vectorizers = {}

    def _build_vectorizer(self, feature_type):
        """Return an unfitted vectorizer configured for ``feature_type``."""
        if feature_type not in self.SUPPORTED_FEATURE_TYPES:
            raise ValueError(
                f"Unknown feature_type {feature_type!r}; "
                f"expected one of {self.SUPPORTED_FEATURE_TYPES}"
            )

        shared = dict(max_features=self.max_features, min_df=2, max_df=0.95)

        if feature_type == 'tfidf':
            # TF-IDF with word-level unigrams, bigrams and trigrams
            return TfidfVectorizer(
                ngram_range=(1, 3),
                token_pattern=self.token_pattern,
                use_idf=True,
                smooth_idf=True,
                sublinear_tf=True,  # Use log(TF)
                **shared
            )

        if feature_type == 'tfidf_char':
            # Character-level TF-IDF (token_pattern is not used by this analyzer)
            return TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                use_idf=True,
                smooth_idf=True,
                sublinear_tf=True,
                **shared
            )

        if feature_type == 'count':
            # Raw term counts
            return CountVectorizer(
                ngram_range=(1, 3),
                token_pattern=self.token_pattern,
                binary=False,
                **shared
            )

        # 'binary': presence/absence bag of words
        return CountVectorizer(
            ngram_range=(1, 2),
            token_pattern=self.token_pattern,
            binary=True,
            **shared
        )

    def fit_transform_features(self, X_train, X_val, X_test, feature_type='tfidf'):
        """Fit a vectorizer on the training texts and transform all three splits.

        Args:
            X_train, X_val, X_test: Iterables of text documents.
            feature_type: One of ``'tfidf'``, ``'tfidf_char'``, ``'count'``,
                ``'binary'``.

        Returns:
            Tuple ``(X_train_vec, X_val_vec, X_test_vec)`` of document-term
            matrices.

        Raises:
            ValueError: If ``feature_type`` is not supported.
        """
        vectorizer = self._build_vectorizer(feature_type)

        # Fit on training data only, then transform all sets
        X_train_vec = vectorizer.fit_transform(X_train)
        X_val_vec = vectorizer.transform(X_val)
        X_test_vec = vectorizer.transform(X_test)

        self.vectorizers[feature_type] = vectorizer

        print(f"{feature_type} features shape: {X_train_vec.shape}")

        return X_train_vec, X_val_vec, X_test_vec

    def combine_features(self, text_features, numerical_features):
        """Column-stack text features with numerical features.

        Args:
            text_features: Sparse matrix of text features.
            numerical_features: Array-like with one row per document.

        Returns:
            CSR sparse matrix with the numerical columns appended on the right.
        """
        from scipy.sparse import csr_matrix, hstack

        numerical_sparse = csr_matrix(numerical_features)

        # Pin the output format: CSR supports the row indexing done downstream,
        # whereas the format hstack picks on its own is not guaranteed to.
        return hstack([text_features, numerical_sparse], format='csr')

# Initialize feature extractor
feature_extractor = FeatureExtractor(max_features=5000)

# Cell 6 - Train/Test Split and Feature Preparation


In [None]:
# Prepare data splits
def prepare_data_splits(df, test_size=0.2, val_size=0.1):
    """Create stratified train / validation / test splits of every feature view.

    Row indices are split once and every representation is then indexed with
    the same index arrays, so all views are aligned by construction.

    Args:
        df: DataFrame containing ``processed_text``, ``processed_text_with_stop``,
            ``char_trigrams``, the eight numerical feature columns and
            ``Polarity``.
        test_size: Fraction of all rows held out for testing.
        val_size: Fraction of all rows held out for validation.

    Returns:
        Dict with keys ``'text'``, ``'text_with_stop'``, ``'char'``,
        ``'numerical'`` and ``'labels'``; each value is a
        ``(train, val, test)`` tuple of NumPy arrays.

    Raises:
        ValueError: If the two fractions are not in (0, 1) or leave no rows for
            training.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must be fractions in (0, 1) whose sum is below 1; "
            f"got test_size={test_size}, val_size={val_size}"
        )

    numerical_cols = ['char_count', 'word_count', 'unique_word_count', 'word_diversity',
                      'exclamation_count', 'question_count', 'punctuation_count', 'avg_word_length']

    views = {
        'text': df['processed_text'].values,
        'text_with_stop': df['processed_text_with_stop'].values,
        'char': df['char_trigrams'].values,
        'numerical': df[numerical_cols].values,
        'labels': df['Polarity'].values,
    }
    y = views['labels']

    # First split: (train + val) vs test
    idx_temp, idx_test = train_test_split(
        np.arange(len(y)), test_size=test_size, stratify=y, random_state=42
    )

    # Second split: train vs validation. val_size is a fraction of the whole
    # dataset, so rescale it to a fraction of the remaining rows.
    val_size_adjusted = val_size / (1 - test_size)
    idx_train, idx_val = train_test_split(
        idx_temp, test_size=val_size_adjusted, stratify=y[idx_temp], random_state=42
    )

    data = {
        name: (values[idx_train], values[idx_val], values[idx_test])
        for name, values in views.items()
    }
    y_train, y_val, y_test = data['labels']

    print("Data split sizes:")
    print(f"Train: {len(y_train)} samples")
    print(f"Val: {len(y_val)} samples")
    print(f"Test: {len(y_test)} samples")

    print("\nClass distribution:")
    print(f"Train: {np.bincount(y_train)}")
    print(f"Val: {np.bincount(y_val)}")
    print(f"Test: {np.bincount(y_test)}")

    return data

# Prepare data
data_splits = prepare_data_splits(df)
y_train, y_val, y_test = data_splits['labels']

# Calculate class weights. compute_class_weight returns one weight per entry
# of `classes`, in that order, so key the dict by the actual class labels
# rather than by position.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weight_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))
print(f"\nClass weights: {class_weight_dict}")

# Cell 7 - Extract Features


In [None]:
# Extract different types of features
print("Extracting features...")
print("="*50)

# 1. TF-IDF features
print("\n1. TF-IDF Word Features:")
X_train_tfidf, X_val_tfidf, X_test_tfidf = feature_extractor.fit_transform_features(
    *data_splits['text'], feature_type='tfidf'
)

# 2. TF-IDF character features
# The 'tfidf_char' vectorizer builds character n-grams itself, so it is given
# the cleaned text. Feeding it data_splits['char'] (strings of already-joined
# trigrams) would take n-grams of n-grams.
print("\n2. TF-IDF Character Features:")
X_train_tfidf_char, X_val_tfidf_char, X_test_tfidf_char = feature_extractor.fit_transform_features(
    *data_splits['text_with_stop'], feature_type='tfidf_char'
)

# 3. Count features
print("\n3. Count Features:")
X_train_count, X_val_count, X_test_count = feature_extractor.fit_transform_features(
    *data_splits['text_with_stop'], feature_type='count'
)

# 4. Combine features
print("\n4. Combining Features:")

# Scale numerical features (statistics are learned from the training split only)
X_numerical_train, X_numerical_val, X_numerical_test = data_splits['numerical']
scaler = StandardScaler()
X_numerical_train_scaled = scaler.fit_transform(X_numerical_train)
X_numerical_val_scaled = scaler.transform(X_numerical_val)
X_numerical_test_scaled = scaler.transform(X_numerical_test)

# Combine TF-IDF with numerical features
X_train_combined = feature_extractor.combine_features(X_train_tfidf, X_numerical_train_scaled)
X_val_combined = feature_extractor.combine_features(X_val_tfidf, X_numerical_val_scaled)
X_test_combined = feature_extractor.combine_features(X_test_tfidf, X_numerical_test_scaled)

print(f"Combined features shape: {X_train_combined.shape}")

# Store all feature sets as (train, val, test) tuples
feature_sets = {
    'tfidf': (X_train_tfidf, X_val_tfidf, X_test_tfidf),
    'tfidf_char': (X_train_tfidf_char, X_val_tfidf_char, X_test_tfidf_char),
    'count': (X_train_count, X_val_count, X_test_count),
    'combined': (X_train_combined, X_val_combined, X_test_combined)
}

# Cell 8 - SVM Implementation


In [None]:
def train_svm_models(feature_sets, y_train, y_val, y_test, class_weight_dict,
                     rbf_max_samples=3000):
    """Grid-search a linear and an RBF SVM on every feature set.

    Args:
        feature_sets: Dict ``name -> (X_train, X_val, X_test)``.
        y_train, y_val, y_test: Label arrays matching the three splits.
        class_weight_dict: ``class_weight`` mapping passed to both SVMs.
        rbf_max_samples: The RBF SVM is trained on at most this many training
            rows; larger training sets are subsampled.

    Returns:
        Nested dict ``results[feature_name][svm_name]`` with keys ``model``,
        ``best_params``, ``val_acc``, ``val_f1``, ``test_acc``, ``test_f1`` and
        ``y_test_pred``.
    """

    results = {}
    y_train = np.asarray(y_train)

    # SVM configurations
    svm_configs = {
        'LinearSVC': {
            'model': LinearSVC(class_weight=class_weight_dict, random_state=42, max_iter=10000),
            'param_grid': {
                'C': [0.01, 0.1, 1.0, 10.0],
                'penalty': ['l2'],
                'loss': ['squared_hinge']
            }
        },
        'SVC_RBF': {
            'model': SVC(kernel='rbf', class_weight=class_weight_dict, random_state=42,
                        probability=True, cache_size=2000),
            'param_grid': {
                'C': [0.1, 1.0, 10.0],
                'gamma': ['scale', 'auto', 0.001, 0.01]
            }
        }
    }

    for feature_name, (X_train, X_val, X_test) in feature_sets.items():
        print(f"\n{'='*60}")
        print(f"Training SVM models with {feature_name} features")
        print(f"{'='*60}")

        results[feature_name] = {}

        # Some sparse formats (e.g. COO) cannot be row-indexed; CSR can.
        if hasattr(X_train, 'tocsr'):
            X_train = X_train.tocsr()
        n_train = X_train.shape[0]

        for svm_name, config in svm_configs.items():
            print(f"\n{svm_name}:")

            # Use smaller dataset for RBF kernel to speed up training
            if svm_name == 'SVC_RBF' and n_train > rbf_max_samples:
                print("Using subset for RBF kernel due to computational constraints...")
                # Stratified, seeded subsample: keeps the class proportions and
                # does not depend on the state of the global NumPy RNG.
                subset_idx, _ = train_test_split(
                    np.arange(n_train),
                    train_size=rbf_max_samples,
                    stratify=y_train,
                    random_state=42
                )
                X_train_subset = X_train[subset_idx]
                y_train_subset = y_train[subset_idx]
            else:
                X_train_subset = X_train
                y_train_subset = y_train

            # Grid search with cross-validation (the estimator is cloned by
            # GridSearchCV, so the shared config objects stay unfitted)
            cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

            grid_search = GridSearchCV(
                config['model'],
                config['param_grid'],
                cv=cv,
                scoring='f1_weighted',
                n_jobs=-1,
                verbose=1
            )

            # Train
            grid_search.fit(X_train_subset, y_train_subset)

            # Best model (refit on the data passed to fit)
            best_model = grid_search.best_estimator_

            # Predictions
            y_val_pred = best_model.predict(X_val)
            y_test_pred = best_model.predict(X_test)

            # Metrics
            val_acc = accuracy_score(y_val, y_val_pred)
            val_f1 = f1_score(y_val, y_val_pred, average='weighted')
            test_acc = accuracy_score(y_test, y_test_pred)
            test_f1 = f1_score(y_test, y_test_pred, average='weighted')

            print(f"Best params: {grid_search.best_params_}")
            print(f"Val - Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")
            print(f"Test - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")

            # Store results
            results[feature_name][svm_name] = {
                'model': best_model,
                'best_params': grid_search.best_params_,
                'val_acc': val_acc,
                'val_f1': val_f1,
                'test_acc': test_acc,
                'test_f1': test_f1,
                'y_test_pred': y_test_pred
            }

    return results

# Restrict SVM training to two feature sets to keep the runtime manageable.
best_feature_sets = {
    feature_key: feature_sets[feature_key]
    for feature_key in ('tfidf', 'combined')
}

print("Training SVM models...")
svm_results = train_svm_models(
    best_feature_sets,
    y_train,
    y_val,
    y_test,
    class_weight_dict,
)

# Cell 9 - Random Forest Implementation


In [None]:
def train_random_forest_models(feature_sets, y_train, y_val, y_test, class_weight_dict,
                               param_grid=None):
    """Grid-search a Random Forest on every feature set.

    Args:
        feature_sets: Dict ``name -> (X_train, X_val, X_test)``.
        y_train, y_val, y_test: Label arrays matching the three splits.
        class_weight_dict: ``class_weight`` mapping passed to the forest.
        param_grid: Optional grid for ``GridSearchCV``. Defaults to a reduced
            32-combination grid chosen for faster training; pass a larger grid
            for a more exhaustive search.

    Returns:
        Dict ``results[feature_name]`` with keys ``model``, ``best_params``,
        ``val_acc``, ``val_f1``, ``test_acc``, ``test_f1``, ``y_test_pred``,
        ``y_test_proba`` and ``feature_importance``.
    """

    if param_grid is None:
        # Simplified grid for faster training
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [20, 30],
            'min_samples_split': [5, 10],
            'min_samples_leaf': [2, 4],
            'max_features': ['sqrt', None]
        }

    results = {}

    for feature_name, (X_train, X_val, X_test) in feature_sets.items():
        print(f"\n{'='*60}")
        print(f"Training Random Forest with {feature_name} features")
        print(f"{'='*60}")

        rf = RandomForestClassifier(
            class_weight=class_weight_dict,
            random_state=42,
            n_jobs=-1
        )

        # Grid search
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

        grid_search = GridSearchCV(
            rf,
            param_grid,
            cv=cv,
            scoring='f1_weighted',
            n_jobs=-1,
            verbose=1
        )

        # Train
        grid_search.fit(X_train, y_train)

        # Best model
        best_rf = grid_search.best_estimator_

        # Predictions
        y_val_pred = best_rf.predict(X_val)
        y_test_pred = best_rf.predict(X_test)

        # Get prediction probabilities
        y_test_proba = best_rf.predict_proba(X_test)

        # Metrics
        val_acc = accuracy_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred, average='weighted')
        test_acc = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='weighted')

        print(f"Best params: {grid_search.best_params_}")
        print(f"Val - Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")
        print(f"Test - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")

        # Store results (feature_importance has one entry per input column)
        results[feature_name] = {
            'model': best_rf,
            'best_params': grid_search.best_params_,
            'val_acc': val_acc,
            'val_f1': val_f1,
            'test_acc': test_acc,
            'test_f1': test_f1,
            'y_test_pred': y_test_pred,
            'y_test_proba': y_test_proba,
            'feature_importance': best_rf.feature_importances_
        }

    return results

# Fit the Random Forest grid search on the same feature sets used for the SVMs.
print("Training Random Forest models...")
rf_results = train_random_forest_models(
    best_feature_sets,
    y_train,
    y_val,
    y_test,
    class_weight_dict,
)

# Cell 10 - Additional ML Models (Quick Training)


In [None]:
def train_additional_models(X_train, X_val, X_test, y_train, y_val, y_test, class_weight_dict):
    """Fit a few baseline classifiers on one feature set for comparison.

    Models: Logistic Regression, Multinomial Naive Bayes and Extra Trees.
    Multinomial Naive Bayes rejects negative feature values, so it is skipped
    (with a message) when ``X_train`` contains any, e.g. standardized columns.

    Args:
        X_train, X_val, X_test: Feature matrices for the three splits.
        y_train, y_val, y_test: Matching label arrays.
        class_weight_dict: ``class_weight`` mapping for the models that accept one.

    Returns:
        Dict ``results[model_name]`` with keys ``model``, ``val_acc``,
        ``val_f1``, ``test_acc``, ``test_f1`` and ``y_test_pred``. Skipped
        models have no entry.
    """

    models = {
        'Logistic Regression': LogisticRegression(
            class_weight=class_weight_dict,
            max_iter=1000,
            random_state=42,
            solver='liblinear'
        ),
        'Multinomial NB': MultinomialNB(alpha=1.0),
        'Extra Trees': ExtraTreesClassifier(
            n_estimators=100,
            class_weight=class_weight_dict,
            random_state=42,
            n_jobs=-1
        )
    }

    # .min() works for both dense arrays and scipy sparse matrices.
    has_negative_values = X_train.min() < 0

    results = {}

    for name, model in models.items():
        if has_negative_values and isinstance(model, MultinomialNB):
            print(f"\nSkipping {name}: it requires non-negative features, but the "
                  f"training matrix contains negative values (e.g. standardized columns). "
                  f"Pass count or TF-IDF features to include it.")
            continue

        print(f"\nTraining {name}...")

        # Train
        model.fit(X_train, y_train)

        # Predictions
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)

        # Metrics
        val_acc = accuracy_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred, average='weighted')
        test_acc = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='weighted')

        print(f"Val - Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")
        print(f"Test - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")

        results[name] = {
            'model': model,
            'val_acc': val_acc,
            'val_f1': val_f1,
            'test_acc': test_acc,
            'test_f1': test_f1,
            'y_test_pred': y_test_pred
        }

    return results

# Baseline models are trained on the combined (TF-IDF + numerical) features.
print("\nTraining additional ML models...")
print("="*60)
X_train_best, X_val_best, X_test_best = feature_sets['combined']
additional_results = train_additional_models(
    X_train_best,
    X_val_best,
    X_test_best,
    y_train,
    y_val,
    y_test,
    class_weight_dict,
)

# Cell 11 - Model Comparison and Visualization


In [None]:
def compare_all_models(svm_results, rf_results, additional_results, y_test):
    """Tabulate and plot the metrics of every trained model.

    Prints a table sorted by test F1, saves a 2x2 figure to
    ``plots/ml_models_comparison.png`` and shows it. Reads the module-level
    ``class_names`` and ``num_classes``.

    Args:
        svm_results: ``results[feature_name][svm_name]`` dict from SVM training.
        rf_results: ``results[feature_name]`` dict from Random Forest training.
        additional_results: ``results[model_name]`` dict of the baseline models.
        y_test: True test labels.

    Returns:
        DataFrame with columns ``Model``, ``Features``, ``Test Accuracy``,
        ``Test F1``, ``Val Accuracy`` and ``Val F1``, sorted by ``Test F1``
        in descending order.
    """

    def _summary_row(model_label, feature_label, metrics):
        """Build one row of the comparison table from a metrics dict."""
        return {
            'Model': model_label,
            'Features': feature_label,
            'Test Accuracy': metrics['test_acc'],
            'Test F1': metrics['test_f1'],
            'Val Accuracy': metrics['val_acc'],
            'Val F1': metrics['val_f1']
        }

    # Collect all results
    all_results = []

    for feature_name, svm_dict in svm_results.items():
        for svm_type, metrics in svm_dict.items():
            all_results.append(_summary_row(f'SVM {svm_type}', feature_name, metrics))

    for feature_name, metrics in rf_results.items():
        all_results.append(_summary_row('Random Forest', feature_name, metrics))

    for model_name, metrics in additional_results.items():
        all_results.append(_summary_row(model_name, 'combined', metrics))

    # NOTE(review): models are ranked by *test* F1, so the "best model" is
    # selected on the test set; ranking by 'Val F1' would avoid that leakage.
    results_df = pd.DataFrame(all_results)
    results_df = results_df.sort_values('Test F1', ascending=False)

    print("\n" + "="*80)
    print("MODEL COMPARISON RESULTS")
    print("="*80)
    print(results_df.to_string(index=False))

    # Visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Model comparison bar plot
    ax = axes[0, 0]
    x = np.arange(len(results_df))
    width = 0.35

    ax.bar(x - width/2, results_df['Test Accuracy'], width, label='Accuracy', color='skyblue')
    ax.bar(x + width/2, results_df['Test F1'], width, label='F1 Score', color='lightcoral')

    ax.set_xlabel('Models')
    ax.set_ylabel('Score')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels([f"{row['Model']}\n({row['Features']})" for _, row in results_df.iterrows()],
                       rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 2. Best model confusion matrix
    best_model_name = results_df.iloc[0]['Model']
    best_feature = results_df.iloc[0]['Features']

    # Get predictions for best model
    if 'SVM' in best_model_name:
        svm_type = best_model_name.split()[-1]
        y_pred = svm_results[best_feature][svm_type]['y_test_pred']
    elif best_model_name == 'Random Forest':
        y_pred = rf_results[best_feature]['y_test_pred']
    else:
        y_pred = additional_results[best_model_name]['y_test_pred']

    # Pin the label order so the matrix always has num_classes rows/columns
    # matching the tick labels, even if a class is absent from the test
    # labels and predictions.
    class_labels = list(range(num_classes))
    tick_labels = [class_names[i] for i in class_labels]
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 1],
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    axes[0, 1].set_title(f'Best Model Confusion Matrix\n({best_model_name} with {best_feature})')
    axes[0, 1].set_ylabel('True Label')
    axes[0, 1].set_xlabel('Predicted Label')

    # 3. Feature importance (if Random Forest results exist)
    if rf_results:
        ax = axes[1, 0]

        # Prefer 'combined', then 'tfidf', otherwise whichever set was trained
        if 'combined' in rf_results:
            rf_feature = 'combined'
        elif 'tfidf' in rf_results:
            rf_feature = 'tfidf'
        else:
            rf_feature = next(iter(rf_results))
        feature_importance = rf_results[rf_feature]['feature_importance']

        # Get top 20 features
        top_indices = np.argsort(feature_importance)[-20:]
        top_importance = feature_importance[top_indices]

        bar_positions = np.arange(len(top_importance))
        ax.barh(bar_positions, top_importance)
        # Label each bar with its real column index in the feature matrix
        ax.set_yticks(bar_positions)
        ax.set_yticklabels(top_indices)
        ax.set_xlabel('Feature Importance')
        ax.set_title('Top 20 Feature Importances (Random Forest)')
        ax.set_ylabel('Feature Index')
    else:
        axes[1, 0].text(0.5, 0.5, 'Feature importance not available',
                        ha='center', va='center', transform=axes[1, 0].transAxes)
        axes[1, 0].set_title('Feature Importance')

    # 4. Model comparison table
    ax = axes[1, 1]
    ax.axis('tight')
    ax.axis('off')

    # Create summary table
    summary_data = results_df[['Model', 'Features', 'Test Accuracy', 'Test F1']].head(5)
    table = ax.table(cellText=summary_data.values,
                     colLabels=summary_data.columns,
                     cellLoc='center',
                     loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.5)
    ax.set_title('Top 5 Models Summary', pad=20)

    plt.tight_layout()
    plt.savefig('plots/ml_models_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    return results_df

# Build the comparison table and figure for every trained model.
results_summary = compare_all_models(
    svm_results,
    rf_results,
    additional_results,
    y_test,
)

# Cell 12 - Detailed Analysis of Best Models
 

In [None]:
def detailed_model_analysis(model_results, model_name, X_test, y_test, feature_name='combined'):
    """Print a classification report, error analysis and confidence statistics.

    The model is looked up in the module-level ``svm_results``, ``rf_results``
    or ``additional_results`` depending on ``model_name``; ``class_names`` and
    ``num_classes`` are also read from module scope.

    Args:
        model_results: Unused by this function (kept for call compatibility).
        model_name: ``'SVM <type>'``, ``'Random Forest'`` or a key of
            ``additional_results``.
        X_test: Test feature matrix, used only for ``predict_proba``.
        y_test: True test labels.
        feature_name: Feature-set key used for the SVM / Random Forest lookups.
    """

    print(f"\n{'='*60}")
    print(f"DETAILED ANALYSIS: {model_name}")
    print(f"{'='*60}")

    # Get model and predictions
    if 'SVM' in model_name:
        svm_type = model_name.split()[-1]
        model_info = svm_results[feature_name][svm_type]
    elif model_name == 'Random Forest':
        model_info = rf_results[feature_name]
    else:
        model_info = additional_results[model_name]

    model = model_info['model']
    y_pred = model_info['y_test_pred']

    # Pin the label order so the per-class arrays always have num_classes
    # entries, even when a class is absent from y_test and y_pred.
    class_labels = list(range(num_classes))
    label_names = [class_names[i] for i in class_labels]

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred,
                              labels=class_labels,
                              target_names=label_names,
                              digits=4,
                              zero_division=0))

    # Per-class performance
    precision, recall, f1, support = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels, average=None, zero_division=0
    )

    print("\nPer-Class Performance:")
    print(f"{'Class':<15} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print("-" * 55)
    for i in class_labels:
        print(f"{class_names[i]:<15} {precision[i]:<10.4f} {recall[i]:<10.4f} "
              f"{f1[i]:<10.4f} {support[i]:<10}")

    # Error analysis
    errors = y_pred != y_test
    error_indices = np.where(errors)[0]

    print(f"\nError Analysis:")
    print(f"Total errors: {len(error_indices)} ({len(error_indices)/len(y_test)*100:.2f}%)")

    # Confusion patterns: "true class → predicted class" counts
    print("\nMost common confusion patterns:")
    confusion_patterns = {}
    for idx in error_indices:
        pattern = f"{class_names[y_test[idx]]} → {class_names[y_pred[idx]]}"
        confusion_patterns[pattern] = confusion_patterns.get(pattern, 0) + 1

    for pattern, count in sorted(confusion_patterns.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"{pattern}: {count} times")

    # If model has predict_proba, analyze confidence
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
        confidence = np.max(y_proba, axis=1)

        print(f"\nConfidence Analysis:")
        print(f"Average confidence: {confidence.mean():.4f}")
        # Guard the means: an empty selection would produce NaN.
        if (~errors).any():
            print(f"Confidence on correct predictions: {confidence[~errors].mean():.4f}")
        if errors.any():
            print(f"Confidence on errors: {confidence[errors].mean():.4f}")

        # Low confidence predictions
        low_conf_threshold = 0.5
        low_conf = confidence < low_conf_threshold
        print(f"\nPredictions with confidence < {low_conf_threshold}: "
              f"{low_conf.sum()} ({low_conf.sum()/len(y_test)*100:.2f}%)")

# Run the detailed analysis for the two highest-ranked models.
top_models = results_summary.head(2)
for _, row in top_models.iterrows():
    model_name = row['Model']
    feature_name = row['Features']

    # Use the test matrix of the model's own feature set when it is known,
    # otherwise fall back to the combined test features.
    X_test_model = (
        feature_sets[feature_name][2]
        if feature_name in feature_sets
        else X_test_best
    )

    detailed_model_analysis(None, model_name, X_test_model, y_test, feature_name)

# Cell 13 - Feature Analysis


In [None]:
def analyze_important_features(vectorizer, feature_importance=None, top_n=30, top_idf=20):
    """Print the most important features and the highest-IDF vocabulary terms.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``. When
            it also has an ``idf_`` attribute, the terms with the largest IDF
            weight are listed.
        feature_importance: Optional sequence of per-column importance scores.
            It may cover more columns than the vocabulary (presumably when extra
            columns were appended after the text features); such columns are
            reported with a placeholder name instead of being dropped.
        top_n: Number of top-importance features to print.
        top_idf: Number of highest-IDF terms to print.

    Returns:
        None. Everything is written to stdout.
    """

    print(f"\n{'='*60}")
    print("FEATURE ANALYSIS")
    print(f"{'='*60}")

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    if feature_importance is not None:
        # argsort is ascending: reverse for best-first, then keep top_n.
        # (Slicing after the reversal also behaves correctly for top_n == 0.)
        indices = np.argsort(feature_importance)[::-1][:top_n]

        print(f"\nTop {top_n} important features:")
        print(f"{'Rank':<6} {'Feature':<30} {'Importance':<10}")
        print("-" * 46)

        for rank, idx in enumerate(indices, start=1):
            if idx < len(feature_names):
                name = feature_names[idx]
            else:
                # Column beyond the vocabulary: keep its rank visible rather
                # than silently skipping it and leaving a gap in the table.
                name = f"[non-vocabulary feature #{idx}]"
            print(f"{rank:<6} {name:<30} {feature_importance[idx]:<10.6f}")

    # idf_ holds one inverse-document-frequency weight per vocabulary term.
    # These are corpus-level weights (not per-class, not TF-IDF products);
    # the largest values belong to the terms seen in the fewest documents.
    if hasattr(vectorizer, 'idf_'):
        print(f"\n\nTop words by IDF weight (rarest terms first):")

        idf_by_word = dict(zip(feature_names, vectorizer.idf_))
        ranked_words = sorted(idf_by_word.items(), key=lambda x: x[1], reverse=True)

        print(f"{'Word':<30} {'IDF':<10}")
        print("-" * 40)
        for word, score in ranked_words[:top_idf]:
            print(f"{word:<30} {score:<10.4f}")

# Analyze features for best model
best_feature = results_summary.iloc[0]['Features']
if ('Random Forest' in results_summary.iloc[0]['Model']
        and best_feature in feature_extractor.vectorizers):
    # Pair the forest's importances with the vectorizer of the feature set it
    # was looked up under, not unconditionally with the 'tfidf' vectorizer;
    # otherwise importances could be printed next to the wrong vocabulary.
    analyze_important_features(
        feature_extractor.vectorizers[best_feature],
        rf_results[best_feature]['feature_importance']
    )
elif 'tfidf' in feature_extractor.vectorizers:
    # No importances with a matching vectorizer: still show the TF-IDF
    # vocabulary analysis instead of printing nothing at all.
    analyze_important_features(feature_extractor.vectorizers['tfidf'])

# Cell 14 - Save Models and Results


In [None]:
def save_ml_models(best_models, vectorizers, preprocessor, scaler):
    """Save the best ML model and its preprocessing components.

    The model to save is described by the first row of ``best_models``; the
    estimator object is looked up in the module-level ``svm_results``,
    ``rf_results`` or ``additional_results`` dictionaries. The summary also
    reads the module-level names ``df``, ``num_classes``, ``class_names``,
    ``feature_extractor`` and ``numerical_cols``.

    Args:
        best_models: DataFrame of results whose first row is the best model.
            Needs the columns 'Model', 'Features', 'Test Accuracy', 'Test F1'.
        vectorizers: Mapping of fitted vectorizers; its 'tfidf' entry is
            pickled when present. When None, ``feature_extractor.vectorizers``
            is used instead.
        preprocessor: Text preprocessor object to pickle.
        scaler: Fitted scaler object to pickle.

    Returns:
        dict: The summary that was written to results/ml_models_summary.json.
    """

    def _json_default(value):
        # numpy scalars/arrays are not JSON serializable; convert to builtins.
        if isinstance(value, (np.generic, np.ndarray)):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Create the output directories (models and the JSON summary)
    ml_model_dir = 'models/ml_models'
    os.makedirs(ml_model_dir, exist_ok=True)
    os.makedirs('results', exist_ok=True)

    # Best model = first row of the results table
    best_model_info = best_models.iloc[0]
    model_name = best_model_info['Model']
    feature_name = best_model_info['Features']

    # Get the actual model object
    if 'SVM' in model_name:
        # The last word of the model name selects the SVM variant
        svm_type = model_name.split()[-1]
        model = svm_results[feature_name][svm_type]['model']
    elif model_name == 'Random Forest':
        model = rf_results[feature_name]['model']
    else:
        model = additional_results[model_name]['model']

    # Save model
    model_filename = f"{ml_model_dir}/best_model_{model_name.lower().replace(' ', '_')}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Saved model: {model_filename}")

    # Save vectorizer: honour the argument, fall back to the global extractor
    available_vectorizers = vectorizers if vectorizers is not None else feature_extractor.vectorizers
    vectorizer = available_vectorizers.get('tfidf')
    if vectorizer is not None:
        with open(f"{ml_model_dir}/tfidf_vectorizer.pkl", 'wb') as f:
            pickle.dump(vectorizer, f)
        print(f"Saved vectorizer: {ml_model_dir}/tfidf_vectorizer.pkl")

    # Save preprocessor
    with open(f"{ml_model_dir}/preprocessor.pkl", 'wb') as f:
        pickle.dump(preprocessor, f)
    print(f"Saved preprocessor: {ml_model_dir}/preprocessor.pkl")

    # Save scaler
    with open(f"{ml_model_dir}/scaler.pkl", 'wb') as f:
        pickle.dump(scaler, f)
    print(f"Saved scaler: {ml_model_dir}/scaler.pkl")

    # Build the summary under its own name so it cannot be confused with the
    # results DataFrame that the notebook keeps in `results_summary`.
    summary = {
        'best_model': {
            'name': model_name,
            'features': feature_name,
            'test_accuracy': float(best_model_info['Test Accuracy']),
            'test_f1': float(best_model_info['Test F1']),
            # Recorded so that loaders do not have to guess the file name
            'model_path': model_filename
        },
        'all_results': best_models.to_dict('records'),
        'dataset_info': {
            'num_samples': len(df),
            'num_classes': num_classes,
            'class_names': class_names
        },
        'feature_info': {
            'max_features': feature_extractor.max_features,
            'numerical_features': numerical_cols
        },
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    with open('results/ml_models_summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=4, default=_json_default)
    print(f"\nSaved results summary: results/ml_models_summary.json")

    return summary

# Persist the best model, its preprocessing artifacts and the JSON summary
print("\nSaving models and results...")

# Names of the hand-crafted numerical features, in column order
numerical_cols = [
    'char_count',
    'word_count',
    'unique_word_count',
    'word_diversity',
    'exclamation_count',
    'question_count',
    'punctuation_count',
    'avg_word_length',
]

saved_summary = save_ml_models(
    best_models=results_summary,
    vectorizers=feature_extractor.vectorizers,
    preprocessor=preprocessor,
    scaler=scaler,
)

# Cell 15 - Inference Pipeline


In [None]:
class MLSentimentPredictor:
    """Inference pipeline for ML models.

    Loads a pickled classifier, vectorizer, text preprocessor and scaler, and
    turns raw text into a sentiment prediction.

    NOTE(review): predict() always feeds the model the vectorizer output
    followed by the eight scaled numerical features - confirm the pickled
    model was trained on that same combined layout.
    """

    # Order of the numerical feature vector handed to the scaler.
    NUMERICAL_COLS = (
        'char_count', 'word_count', 'unique_word_count', 'word_diversity',
        'exclamation_count', 'question_count', 'punctuation_count', 'avg_word_length'
    )

    def __init__(self, model_path, vectorizer_path, preprocessor_path, scaler_path):
        """Load the four pickled components from disk.

        Args:
            model_path: Path of the pickled classifier.
            vectorizer_path: Path of the pickled text vectorizer.
            preprocessor_path: Path of the pickled text preprocessor.
            scaler_path: Path of the pickled numerical-feature scaler.
        """
        self.model = self._load_pickle(model_path)
        self.vectorizer = self._load_pickle(vectorizer_path)
        self.preprocessor = self._load_pickle(preprocessor_path)
        self.scaler = self._load_pickle(scaler_path)

        self.class_names = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
        self.num_classes = len(self.class_names)

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at ``path``."""
        # SECURITY: unpickling executes code embedded in the file. Only load
        # artifacts that were produced by a trusted source.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def extract_numerical_features(self, text, processed=None):
        """Extract numerical features from text.

        Args:
            text: Raw input text.
            processed: Optional already-preprocessed form of ``text``; when
                omitted it is computed with ``self.preprocessor.preprocess``.

        Returns:
            np.ndarray: One value per entry of ``NUMERICAL_COLS``, in that order.
        """
        if processed is None:
            processed = self.preprocessor.preprocess(text)

        # Split once and reuse; every word-based feature derives from this list.
        words = processed.split()
        unique_words = set(words)

        features = {
            'char_count': len(text),
            'word_count': len(words),
            'unique_word_count': len(unique_words),
            # +1 keeps the ratio defined when there are no words
            'word_diversity': len(unique_words) / (len(words) + 1),
            'exclamation_count': text.count('!'),
            # Counts '?' and the danda '।'.
            # NOTE(review): presumably mirrors the training-time features - confirm.
            'question_count': len(re.findall('[?।]', text)),
            'punctuation_count': sum(1 for c in text if c in '।!?,.-'),
            # Guard on the word list, not the string: whitespace-only text has
            # no words and np.mean([]) would produce NaN.
            'avg_word_length': np.mean([len(word) for word in words]) if words else 0
        }

        return np.array([features[col] for col in self.NUMERICAL_COLS])

    def predict(self, text):
        """Predict sentiment for a single text.

        Args:
            text: Raw input text.

        Returns:
            dict: Keys 'text', 'sentiment' (class name), 'label' (int class id)
            and 'confidence' (highest class probability, or None when the model
            has no ``predict_proba``). When probabilities are available, a
            'probabilities' dict maps each class name to its probability.
        """
        # Preprocess once and share the result with the numerical features
        processed_text = self.preprocessor.preprocess(text)

        # Extract text features
        text_features = self.vectorizer.transform([processed_text])

        # Extract numerical features
        num_features = self.extract_numerical_features(text, processed_text).reshape(1, -1)
        num_features_scaled = self.scaler.transform(num_features)

        # Combine features: text columns first, numerical columns last
        from scipy.sparse import hstack, csr_matrix
        combined_features = hstack([text_features, csr_matrix(num_features_scaled)])

        # Predict
        prediction = self.model.predict(combined_features)[0]
        sentiment = self.class_names[prediction]

        result = {
            'text': text,
            'sentiment': sentiment,
            # Plain int so the result can be serialized (e.g. to JSON)
            'label': int(prediction),
            'confidence': None
        }

        # Get probabilities if available
        if hasattr(self.model, 'predict_proba'):
            probabilities = self.model.predict_proba(combined_features)[0]
            result['confidence'] = float(probabilities.max())

            # Probability columns follow model.classes_, which can differ from
            # 0..n-1 (for instance when a class was absent during training).
            class_ids = getattr(self.model, 'classes_', None)
            if class_ids is None:
                class_ids = range(len(probabilities))
            result['probabilities'] = {
                self.class_names[class_id]: float(probability)
                for class_id, probability in zip(class_ids, probabilities)
            }

        return result

    def predict_batch(self, texts):
        """Predict sentiments for multiple texts.

        Args:
            texts: Iterable of raw input texts.

        Returns:
            list: One ``predict`` result dict per input text, in input order.
        """
        return [self.predict(text) for text in texts]

# Initialize predictor
# Build the model file name from the best model's name with the same rule
# save_ml_models uses when writing it, so the path stays correct whichever
# model ranks first (a hard-coded LinearSVC path breaks otherwise).
best_model_slug = results_summary.iloc[0]['Model'].lower().replace(' ', '_')
ml_predictor = MLSentimentPredictor(
    model_path=f'models/ml_models/best_model_{best_model_slug}.pkl',
    vectorizer_path='models/ml_models/tfidf_vectorizer.pkl',
    preprocessor_path='models/ml_models/preprocessor.pkl',
    scaler_path='models/ml_models/scaler.pkl'
)

# Test predictions
test_texts = [
    "এই পণ্যটি খুবই ভালো, আমি খুব সন্তুষ্ট।",
    "সার্ভিস একদম বাজে, কখনো কিনবেন না।",
    "মোটামুটি ঠিক আছে, দাম অনুযায়ী ভালো।",
    "অসাধারণ! আমার খুব পছন্দ হয়েছে।",
    "খুবই হতাশাজনক অভিজ্ঞতা।"
]

print("\nSENTIMENT PREDICTIONS (ML Model)")
print("="*70)

for text in test_texts:
    result = ml_predictor.predict(text)
    print(f"\nText: {result['text']}")
    print(f"Sentiment: {result['sentiment']}")
    # Confidence is None for models without predict_proba; test for that
    # explicitly instead of relying on truthiness.
    if result['confidence'] is not None:
        print(f"Confidence: {result['confidence']:.4f}")
    if 'probabilities' in result:
        prob_str = ", ".join([f"{k}: {v:.3f}" for k, v in result['probabilities'].items()])
        print(f"Probabilities: {prob_str}")

# Cell 16 - Final Summary


In [None]:
# Separator rules reused throughout the report
heavy_rule = "="*80
light_rule = "-"*50

# Title banner
print("\n" + heavy_rule)
print("MACHINE LEARNING MODELS - FINAL SUMMARY")
print(heavy_rule)

# Dataset size, split sizes and the five top-ranked result rows
print(f"\nDataset: {len(df)} samples, {num_classes} classes")
print(f"Train/Val/Test split: {len(y_train)}/{len(y_val)}/{len(y_test)}")
print(f"\nBest performing models:")
print(results_summary.head(5).to_string(index=False))

# Headline metrics of the first (top-ranked) row of the results table
top_result = results_summary.iloc[0]
print("\n\nMODEL TYPE COMPARISON:")
print(light_rule)
print(f"Best ML Model: {top_result['Model']}")
print(f"  - Accuracy: {top_result['Test Accuracy']:.4f}")
print(f"  - F1 Score: {top_result['Test F1']:.4f}")

# Fixed narrative sections: a heading, a rule, then the numbered lines
print("\n\nKEY INSIGHTS:")
print(light_rule)
for insight_line in (
    "1. Feature engineering is crucial for ML models",
    "2. TF-IDF with n-grams performs well for Bangla text",
    "3. Combining text and numerical features improves performance",
    "4. Class balancing helps with imbalanced datasets",
    "5. Simple models like LinearSVC can be very effective",
):
    print(insight_line)

print("\n\nRECOMMENDATIONS:")
print(light_rule)
for recommendation_line in (
    "1. For production: Use LinearSVC or Logistic Regression (fast inference)",
    "2. For best accuracy: Use Random Forest or SVM with RBF kernel",
    "3. Consider ensemble methods combining multiple models",
    "4. Regular retraining with new data is important",
):
    print(recommendation_line)