<a href="https://colab.research.google.com/github/Jeswin987/Emotion-classifier/blob/master/Emotion_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Jeswin987/Emotion-classifier.git
%cd Emotion-classifier

Cloning into 'Emotion-classifier'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 12.86 KiB | 4.29 MiB/s, done.
/content/Emotion-classifier


In [None]:
!mv "/content/emotions (1).csv" ./

/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `mv /content/emotions (1).csv ./'


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import os
import joblib
import warnings
import smtplib
import json
import requests
from datetime import datetime
from collections import Counter
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# NLTK imports for text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Scikit-learn imports for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, silhouette_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Suppress warnings for cleaner output.
# The 'ignore' filter is global: it silences every warning category, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Download required NLTK data
# NOTE(review): the return values of nltk.download() are not checked, so the
# success message below is printed unconditionally.
print("Downloading NLTK data...")
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
print("✅ NLTK data downloaded successfully!")

# Initialize stemmer and stopwords.
# Both are module-level singletons read by preprocess_text(); stop_words is a
# set so the per-token membership test is a hash lookup.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Emotion label mapping for better interpretation.
# Maps the integer class id stored in the dataset's 'label' column to a
# human-readable emotion name; load_and_explore_data() rejects any label
# outside range(6), i.e. outside the keys of this mapping.
label_map = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

# Global configuration for alerting
# NOTE(review): no function visible in this file reads ALERT_CONFIG;
# setup_alerting_system() builds and returns its own configuration dict with
# different keys and a different accuracy threshold. Confirm whether this
# constant is still needed.
ALERT_CONFIG = {
    'email_enabled': False,
    'discord_enabled': False,
    'threshold_accuracy': 0.8,
    'emotion_thresholds': {
        'anger': 0.3,    # Alert if >30% anger
        'sadness': 0.4,  # Alert if >40% sadness
        'fear': 0.25     # Alert if >25% fear
    }
}

def upload_dataset():
    """
    Locate the emotions dataset, falling back to an interactive Colab upload.

    The known candidate locations are probed in order and the first one that
    exists is returned unchanged. If none exists, the Colab upload widget is
    opened and the first uploaded file is renamed to 'emotions.csv'.

    Returns:
        str | None: Path to the dataset file, or None when the upload step
        fails (including when google.colab is unavailable).
    """
    target_name = 'emotions.csv'
    candidates = (
        'emotions.csv',
        'emotions (1).csv',
        '/content/emotions.csv',
        '/content/emotions (1).csv',
    )

    existing = next((path for path in candidates if os.path.exists(path)), None)
    if existing is not None:
        print(f"✅ Found dataset: {existing}")
        return existing

    print("📁 Dataset not found. Please upload your emotions.csv file.")
    try:
        from google.colab import files
        uploaded_name = list(files.upload().keys())[0]

        # Normalise whatever name the user uploaded to the canonical one.
        if uploaded_name != target_name:
            os.rename(uploaded_name, target_name)
            print(f"✅ Renamed {uploaded_name} to emotions.csv")
    except Exception as err:
        print(f"❌ Error uploading file: {err}")
        return None

    return target_name

def load_and_explore_data(filepath):
    """
    Read the emotions CSV, clean it, and print an exploratory summary.

    Cleaning drops rows containing missing values and rows whose 'text' is a
    duplicate, validates that every label is one of the six known class ids,
    and adds an 'emotion' column with the readable label name.

    Args:
        filepath (str): Path to the CSV file

    Returns:
        pd.DataFrame | None: The cleaned dataset, or None if anything goes
        wrong (the error is printed, not raised).
    """
    banner = "=" * 50
    print(banner)
    print("LOADING AND EXPLORING DATASET")
    print(banner)

    try:
        # Try progressively more permissive encodings until one decodes.
        for codec in ('utf-8', 'latin-1', 'iso-8859-1', 'cp1252'):
            try:
                df = pd.read_csv(filepath, encoding=codec)
            except UnicodeDecodeError:
                continue
            print(f"✅ Successfully loaded with {codec} encoding")
            break
        else:
            raise Exception("Could not decode file with any encoding")

        print(f"✅ Dataset loaded successfully!")
        print(f"📊 Dataset shape: {df.shape}")
        print(f"📋 Columns: {df.columns.tolist()}")

        absent = [name for name in ('text', 'label') if name not in df.columns]
        if absent:
            raise ValueError(f"Missing required columns. Found: {df.columns.tolist()}")

        print("\n🔍 First 5 rows:")
        print(df.head())

        print(f"\n⚠️ Missing values before cleaning:")
        print(df.isnull().sum())

        rows_before = len(df)
        df.dropna(inplace=True)
        df.drop_duplicates(subset='text', inplace=True)
        rows_after = len(df)

        print(f"\n🧹 Data cleaning summary:")
        print(f"   Original size: {rows_before}")
        print(f"   After cleaning: {rows_after}")
        print(f"   Removed: {rows_before - rows_after} rows")

        invalid_labels = set(df['label']).difference(range(6))
        if invalid_labels:
            raise ValueError(f"Invalid labels found: {invalid_labels}")

        df['emotion'] = df['label'].map(label_map)

        print(f"\n📈 Class distribution:")
        for label, count in df['label'].value_counts().sort_index().items():
            share = (count / len(df)) * 100
            print(f"   {label} ({label_map[label]}): {count} ({share:.1f}%)")

        print(f"\n📝 Sample texts by emotion:")
        for label in sorted(df['label'].unique()):
            first_text = df.loc[df['label'] == label, 'text'].iloc[0]
            print(f"   {label_map[label].upper()}: \"{first_text[:80]}...\"")

        return df

    except Exception as err:
        print(f"❌ Error loading dataset: {err}")
        return None

def preprocess_text(text):
    """
    Normalise a raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; strip URLs, @mentions and '#' characters;
    strip remaining punctuation; strip digits; split on whitespace; drop
    stop words and tokens of two characters or fewer; stem what is left.

    Args:
        text (str): Raw text to preprocess

    Returns:
        str: Cleaned text, or "" when the input is not a usable string
    """
    if not isinstance(text, str) or pd.isna(text):
        return ""

    # Removal patterns are applied sequentially; order matters because the
    # punctuation pass would otherwise break up URLs and mentions.
    removal_patterns = (
        r"http\S+|www\S+|https\S+",
        r'\@\w+|\#',
        r'[^\w\s]',
        r'\d+',
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    kept = (
        token for token in cleaned.strip().split()
        if token not in stop_words and len(token) > 2
    )
    return ' '.join(stemmer.stem(token) for token in kept)

def prepare_features(df, test_size=0.2, max_features=10000, subset_size=None, random_state=42):
    """
    Prepare features using optimized TF-IDF vectorization

    The text column is cleaned with preprocess_text(), rows that become empty
    are dropped (and the index is reset), the data is split with
    stratification, and a TF-IDF vectorizer is fitted on the training split
    only. Optionally the training split is then subsampled for faster
    downstream training.

    Note: a 'cleaned_text' column is added to the DataFrame passed in.

    Args:
        df (pd.DataFrame): Dataset with 'text' and 'label' columns
        test_size (float): Proportion of data for testing
        max_features (int): Maximum number of TF-IDF features
        subset_size (int): Optional subset size for faster training
        random_state (int): Seed used for both the train/test split and the
            optional subsampling, so repeated runs select the same rows

    Returns:
        tuple: (X_train_tfidf, X_test_tfidf, y_train, y_test, tfidf, df,
        train_indices) where the first two are the TF-IDF matrices, y_* are
        the label Series, tfidf is the fitted vectorizer, df is the filtered
        DataFrame (index reset), and train_indices are the df index labels of
        the rows in X_train_tfidf, in matching row order.

    Raises:
        ValueError: If no text survives preprocessing.
    """
    print("=" * 50)
    print("FEATURE PREPARATION")
    print("=" * 50)

    print("🔄 Preprocessing text data...")
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Resetting the index keeps label-based and positional lookups aligned
    # for callers that later use train_indices with df.loc.
    df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)
    if len(df) == 0:
        raise ValueError("No valid texts after preprocessing")

    print("\n📝 Preprocessing examples:")
    for i in range(min(3, len(df))):
        print(f"   Original: {df['text'].iloc[i][:60]}...")
        print(f"   Cleaned:  {df['cleaned_text'].iloc[i][:60]}...")
        print()

    X = df['cleaned_text']
    y = df['label']

    print("📊 Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print("🔢 Creating TF-IDF features...")
    tfidf = TfidfVectorizer(
        max_features=max_features,
        min_df=5,
        max_df=0.8,
        ngram_range=(1, 2),
        stop_words='english',
        sublinear_tf=True
    )

    # Fit on the training split only so no vocabulary/IDF information leaks
    # from the test split.
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    train_indices = X_train.index
    if subset_size and subset_size < X_train_tfidf.shape[0]:
        print(f"⚡ Using subset of {subset_size} samples for faster training...")
        # Seeded generator: the subsample was previously drawn from the global
        # unseeded NumPy RNG, which made every run train on different rows.
        rng = np.random.default_rng(random_state)
        subset_indices = rng.choice(X_train_tfidf.shape[0], subset_size, replace=False)
        X_train_tfidf = X_train_tfidf[subset_indices]
        y_train = y_train.iloc[subset_indices]
        train_indices = X_train.index[subset_indices]

    print(f"✅ Feature preparation complete!")
    print(f"   Training features shape: {X_train_tfidf.shape}")
    print(f"   Testing features shape: {X_test_tfidf.shape}")
    print(f"   Training samples: {len(y_train)}")
    print(f"   Testing samples: {len(y_test)}")

    return X_train_tfidf, X_test_tfidf, y_train, y_test, tfidf, df, train_indices

def evaluate_svm_kernel(kernel_type, X_train, X_test, y_train, y_test, show_confusion_matrix=True):
    """
    Train and evaluate SVM with specific kernel

    Args:
        kernel_type (str): Type of SVM kernel
        X_train, X_test: Training and testing features
        y_train, y_test: Training and testing labels
        show_confusion_matrix (bool): Whether to save a confusion-matrix
            heatmap to 'confusion_matrix_<kernel>.png'

    Returns:
        dict: The fitted 'model', its test 'accuracy', the support-weighted
        'precision', 'recall' and 'f1-score', and the test 'predictions'.
    """
    print(f"\n🚀 Training SVM with {kernel_type.upper()} kernel...")

    model_params = {
        'kernel': kernel_type,
        'random_state': 42,
        'probability': True
    }
    if kernel_type == 'poly':
        model_params['degree'] = 3
    elif kernel_type == 'rbf':
        model_params['gamma'] = 'scale'

    model = SVC(**model_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pin the full class list explicitly. Without it, a class that is absent
    # from both y_test and y_pred makes the report's target_names length
    # mismatch (ValueError) and shifts the confusion-matrix tick labels.
    class_ids = sorted(label_map.keys())
    emotion_names = [label_map[i] for i in class_ids]

    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(
        y_test, y_pred, labels=class_ids, output_dict=True, zero_division=0
    )
    weighted = report_dict['weighted avg']

    print(f"✅ {kernel_type.upper()} Kernel Results:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   Precision: {weighted['precision']:.4f}")
    print(f"   Recall: {weighted['recall']:.4f}")
    print(f"   F1-Score: {weighted['f1-score']:.4f}")

    print(f"\n📊 Detailed Classification Report ({kernel_type.upper()}):")
    print(classification_report(
        y_test, y_pred, labels=class_ids, target_names=emotion_names, zero_division=0
    ))

    if show_confusion_matrix:
        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=emotion_names,
                    yticklabels=emotion_names)
        plt.title(f'Confusion Matrix - {kernel_type.upper()} Kernel', fontsize=14, fontweight='bold')
        plt.xlabel('Predicted Emotion', fontsize=12)
        plt.ylabel('Actual Emotion', fontsize=12)
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{kernel_type}.png')
        plt.close()
        print(f"📈 Confusion matrix saved as 'confusion_matrix_{kernel_type}.png'")

    return {
        'model': model,
        'accuracy': accuracy,
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1-score': weighted['f1-score'],
        'predictions': y_pred
    }

def perform_clustering_analysis(X_tfidf, df, train_indices, n_clusters=6):
    """
    Perform comprehensive clustering analysis on emotion data

    Runs a K-Means sweep (k = 2..10) to plot inertia and silhouette curves,
    fits a final K-Means with ``n_clusters``, cross-tabulates clusters against
    emotions, saves PCA and t-SNE scatter plots, prints per-cluster samples,
    and finally attempts LDA topic modeling on the cleaned text.

    Args:
        X_tfidf: Sparse TF-IDF feature matrix (one row per training sample)
        df: DataFrame with 'text', 'label', 'emotion' and 'cleaned_text'
            columns
        train_indices: Index labels of ``df`` for the rows of ``X_tfidf``,
            in the same row order
        n_clusters: Number of clusters for the final K-Means (also used as
            the number of LDA topics)

    Returns:
        dict: Keys 'kmeans_model', 'cluster_labels', 'silhouette_score',
        'cluster_emotion_dist', 'pca_model', 'tsne_model', 'clustered_data',
        plus 'topics' when topic modeling succeeds.
    """
    print("=" * 60)
    print("CLUSTERING ANALYSIS")
    print("=" * 60)

    clustering_results = {}

    print("🔍 Performing K-Means clustering...")
    inertias = []
    silhouette_scores = []
    K_range = range(2, 11)

    print("📊 Finding optimal number of clusters...")
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_tfidf)
        inertias.append(kmeans.inertia_)
        # K_range starts at 2, so the silhouette score is always defined.
        silhouette_scores.append(silhouette_score(X_tfidf, kmeans.labels_))

    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.plot(K_range, inertias, 'bo-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    plt.plot(K_range, silhouette_scores, 'ro-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('clustering_evaluation.png')
    plt.close()
    print("📈 Clustering evaluation plots saved as 'clustering_evaluation.png'")

    print(f"🎯 Using {n_clusters} clusters for detailed analysis...")
    kmeans_final = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans_final.fit_predict(X_tfidf)

    # Rows are selected by label in train_indices order so they line up
    # positionally with the rows of X_tfidf / cluster_labels.
    df_clustered = df.loc[train_indices].copy()
    df_clustered['cluster'] = cluster_labels

    silhouette_avg = silhouette_score(X_tfidf, cluster_labels)
    print(f"✅ Silhouette Score: {silhouette_avg:.4f}")

    print("\n📋 Cluster composition analysis:")
    cluster_emotion_dist = pd.crosstab(df_clustered['cluster'], df_clustered['emotion'])
    print(cluster_emotion_dist)

    plt.figure(figsize=(12, 8))
    sns.heatmap(cluster_emotion_dist, annot=True, fmt='d', cmap='YlOrRd')
    plt.title('Cluster vs Emotion Distribution Heatmap', fontsize=14, fontweight='bold')
    plt.xlabel('Emotion')
    plt.ylabel('Cluster')
    plt.tight_layout()
    plt.savefig('cluster_emotion_heatmap.png')
    plt.close()
    print("📈 Cluster-emotion heatmap saved as 'cluster_emotion_heatmap.png'")

    print("\n🎨 Creating cluster visualizations...")
    pca = PCA(n_components=2, random_state=42)
    # PCA needs a dense array; this materialises the full TF-IDF matrix.
    X_pca = pca.fit_transform(X_tfidf.toarray())

    print("   Computing t-SNE (this may take a moment)...")
    # t-SNE is only run on a random subset of at most 5000 rows to bound cost.
    subset_size = min(5000, X_tfidf.shape[0])
    indices = np.random.choice(X_tfidf.shape[0], subset_size, replace=False)
    X_tfidf_subset = X_tfidf[indices]
    cluster_labels_subset = cluster_labels[indices]
    emotion_labels_subset = df_clustered['label'].iloc[indices]

    # The iteration count is left at the library default (1000): the keyword
    # was renamed from `n_iter` to `max_iter` across scikit-learn releases, so
    # naming either one breaks on some installed version. Perplexity must be
    # smaller than the number of samples, hence the clamp for tiny inputs.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, subset_size - 1))
    X_tsne = tsne.fit_transform(X_tfidf_subset.toarray())

    plt.figure(figsize=(16, 12))
    plt.subplot(2, 2, 1)
    scatter1 = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
    plt.title('PCA - Clusters', fontsize=12, fontweight='bold')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.colorbar(scatter1)

    plt.subplot(2, 2, 2)
    scatter2 = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df_clustered['label'], cmap='tab10', alpha=0.6)
    plt.title('PCA - Emotions', fontsize=12, fontweight='bold')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.colorbar(scatter2)

    plt.subplot(2, 2, 3)
    scatter3 = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cluster_labels_subset, cmap='viridis', alpha=0.6)
    plt.title('t-SNE - Clusters', fontsize=12, fontweight='bold')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.colorbar(scatter3)

    plt.subplot(2, 2, 4)
    scatter4 = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=emotion_labels_subset, cmap='tab10', alpha=0.6)
    plt.title('t-SNE - Emotions', fontsize=12, fontweight='bold')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.colorbar(scatter4)

    plt.tight_layout()
    plt.savefig('dimensionality_reduction_visualizations.png')
    plt.close()
    print("📈 Dimensionality reduction visualizations saved as 'dimensionality_reduction_visualizations.png'")

    print("\n🔍 Detailed cluster characteristics:")
    for cluster_id in range(n_clusters):
        cluster_data = df_clustered[df_clustered['cluster'] == cluster_id]
        dominant_emotion = cluster_data['emotion'].mode().iloc[0] if not cluster_data.empty else "None"
        emotion_dist = cluster_data['emotion'].value_counts()

        print(f"\n   📌 Cluster {cluster_id}:")
        print(f"      Size: {len(cluster_data)} tweets")
        print(f"      Dominant emotion: {dominant_emotion}")
        print(f"      Emotion distribution: {dict(emotion_dist)}")

        sample_tweets = cluster_data['text'].head(2).tolist()
        for i, tweet in enumerate(sample_tweets):
            print(f"      Sample {i+1}: \"{tweet[:80]}...\"")

    print("\n📚 Performing topic modeling...")
    # Topic modeling is best-effort: a failure is reported but does not stop
    # the rest of the results from being returned.
    try:
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(max_features=1000, stop_words='english', min_df=5)
        X_counts = vectorizer.fit_transform(df_clustered['cleaned_text'])
        lda = LatentDirichletAllocation(n_components=n_clusters, random_state=42, max_iter=10)
        lda.fit(X_counts)
        feature_names = vectorizer.get_feature_names_out()

        print("\n🏷️ Topics discovered:")
        for topic_idx, topic in enumerate(lda.components_):
            # Ten highest-weighted terms, strongest first.
            top_words = [feature_names[i] for i in topic.argsort()[-10:][::-1]]
            print(f"   Topic {topic_idx}: {', '.join(top_words)}")

        clustering_results['topics'] = {
            'model': lda,
            'vectorizer': vectorizer,
            'feature_names': feature_names
        }
    except Exception as e:
        print(f"   ⚠️ Topic modeling failed: {e}")

    clustering_results.update({
        'kmeans_model': kmeans_final,
        'cluster_labels': cluster_labels,
        'silhouette_score': silhouette_avg,
        'cluster_emotion_dist': cluster_emotion_dist,
        'pca_model': pca,
        'tsne_model': tsne,
        'clustered_data': df_clustered
    })

    print("\n✅ Clustering analysis complete!")
    return clustering_results

def setup_alerting_system():
    """
    Build the default alerting configuration and print its thresholds.

    Only console alerts are enabled by default; the email and Discord
    sections hold placeholder credentials and are switched off.

    Returns:
        dict: Alerting configuration with the sections 'enabled',
        'email_alerts', 'discord_alerts', 'console_alerts' and 'thresholds'
    """
    rule = "=" * 50
    print(rule)
    print("ALERTING SYSTEM SETUP")
    print(rule)

    email_settings = {
        'enabled': False,
        'smtp_server': 'smtp.gmail.com',
        'smtp_port': 587,
        'sender_email': 'your-email@gmail.com',
        'sender_password': 'your-app-password',
        'recipient_email': 'recipient@gmail.com',
    }
    discord_settings = {
        'enabled': False,
        'webhook_url': 'https://discord.com/api/webhooks/YOUR_WEBHOOK_URL',
    }
    # Fractions in [0, 1]: minimum acceptable accuracy and maximum acceptable
    # share of each monitored emotion.
    limits = {
        'accuracy_min': 0.75,
        'anger_max': 0.30,
        'sadness_max': 0.40,
        'fear_max': 0.25,
        'negative_emotions_max': 0.60,
    }

    alert_config = {
        'enabled': True,
        'email_alerts': email_settings,
        'discord_alerts': discord_settings,
        'console_alerts': {'enabled': True},
        'thresholds': limits,
    }

    print("✅ Alerting system configured!")
    print("📋 Alert thresholds:")
    for name in limits:
        print(f"   {name}: {limits[name]}")

    return alert_config

def send_alert(alert_type, message, details, alert_config):
    """
    Fan an alert out to every enabled channel (console, email, Discord).

    Delivery is best-effort: a failure in the email or Discord channel is
    printed and does not prevent the remaining channels from being tried.

    Args:
        alert_type (str): Type of alert (accuracy, emotion_threshold, etc.)
        message (str): Alert message
        details (dict): Additional details
        alert_config (dict): Alerting configuration
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    console_on = alert_config['console_alerts']['enabled']
    if console_on:
        siren_row = "🚨" * 20
        print("\n" + siren_row)
        print(f"ALERT: {alert_type.upper()}")
        print(f"Time: {timestamp}")
        print(f"Message: {message}")
        if details:
            print("Details:")
            for name, value in details.items():
                print(f"  {name}: {value}")
        print(siren_row)

    email_settings = alert_config['email_alerts']
    if email_settings['enabled']:
        try:
            send_email_alert(alert_type, message, details, email_settings, timestamp)
        except Exception as err:
            print(f"⚠️ Email alert failed: {err}")

    discord_settings = alert_config['discord_alerts']
    if discord_settings['enabled']:
        try:
            send_discord_alert(alert_type, message, details, discord_settings, timestamp)
        except Exception as err:
            print(f"⚠️ Discord alert failed: {err}")

def send_email_alert(alert_type, message, details, email_config, timestamp):
    """
    Send an alert as a plain-text email over SMTP with STARTTLS.

    Args:
        alert_type (str): Type of alert, used in the subject line
        message (str): Alert message
        details (dict | None): Extra key/value pairs appended to the body;
            None is treated as "no details"
        email_config (dict): Must provide 'smtp_server', 'smtp_port',
            'sender_email', 'sender_password' and 'recipient_email'
        timestamp (str): Pre-formatted time of the alert

    Raises:
        OSError / smtplib.SMTPException: If connecting, authenticating or
            sending fails. Callers are expected to handle these.
    """
    subject = f"Emotion Analysis Alert: {alert_type}"
    body = f"""
    Emotion Analysis Alert

    Type: {alert_type}
    Time: {timestamp}
    Message: {message}

    Details:
    """
    # `details or {}` keeps a None payload from crashing before anything is sent.
    body += "".join(f"  {key}: {value}\n" for key, value in (details or {}).items())

    msg = MIMEMultipart()
    msg['From'] = email_config['sender_email']
    msg['To'] = email_config['recipient_email']
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))

    # The context manager closes the connection even when STARTTLS, login or
    # sending raises; the timeout stops an unreachable server from blocking
    # the pipeline indefinitely.
    with smtplib.SMTP(email_config['smtp_server'], email_config['smtp_port'], timeout=30) as server:
        server.starttls()
        server.login(email_config['sender_email'], email_config['sender_password'])
        server.send_message(msg)

def send_discord_alert(alert_type, message, details, discord_config, timestamp):
    """
    Send an alert to a Discord webhook as a single red embed.

    Args:
        alert_type (str): Type of alert, used in the embed title
        message (str): Alert message, used as the embed description
        details (dict | None): Extra key/value pairs rendered as inline
            embed fields; None is treated as "no details"
        discord_config (dict): Must provide 'webhook_url'
        timestamp (str): Accepted for signature parity with the other
            senders; the embed uses a freshly generated ISO-8601 timestamp
            instead.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response from the webhook.
    """
    fields = [
        {"name": key, "value": str(value), "inline": True}
        for key, value in (details or {}).items()
    ]
    webhook_data = {
        "embeds": [{
            "title": f"🚨 Emotion Analysis Alert: {alert_type}",
            "description": message,
            "color": 16711680,  # 0xFF0000, red
            "timestamp": datetime.now().isoformat(),
            "fields": fields
        }]
    }

    # requests has no default timeout; without one a stalled webhook would
    # block the whole pipeline.
    response = requests.post(discord_config['webhook_url'], json=webhook_data, timeout=10)
    response.raise_for_status()

def monitor_emotion_patterns(results, clustering_results, alert_config):
    """
    Monitor emotion patterns and trigger alerts if thresholds are exceeded

    Three checks are made: the best kernel accuracy against 'accuracy_min',
    each negative emotion's share against its '<emotion>_max' threshold, and
    the combined share of negative emotions against 'negative_emotions_max'.

    Args:
        results (dict): SVM classification results keyed by kernel name; each
            value must contain an 'accuracy' entry
        clustering_results (dict): Clustering analysis results; the
            'clustered_data' DataFrame's 'emotion' column is inspected
        alert_config (dict): Alerting configuration with a 'thresholds' dict
    """
    print("=" * 50)
    print("EMOTION PATTERN MONITORING")
    print("=" * 50)

    thresholds = alert_config['thresholds']
    best_accuracy = max(metrics['accuracy'] for metrics in results.values())

    if best_accuracy < thresholds['accuracy_min']:
        send_alert(
            'low_accuracy',
            f'Model accuracy ({best_accuracy:.4f}) is below threshold ({thresholds["accuracy_min"]})',
            {'accuracy': best_accuracy, 'threshold': thresholds['accuracy_min']},
            alert_config
        )

    df_clustered = clustering_results['clustered_data']
    emotion_counts = df_clustered['emotion'].value_counts()
    emotion_dist = df_clustered['emotion'].value_counts(normalize=True)

    print("📊 Current emotion distribution:")
    for emotion, percentage in emotion_dist.items():
        print(f"   {emotion}: {percentage:.2%}")

    negative_emotions = ['anger', 'sadness', 'fear']
    for emotion in negative_emotions:
        threshold_key = f'{emotion}_max'
        if emotion not in emotion_dist or threshold_key not in thresholds:
            continue

        percentage = emotion_dist[emotion]
        threshold = thresholds[threshold_key]
        if percentage > threshold:
            send_alert(
                f'high_{emotion}',
                f'High {emotion} detected: {percentage:.2%} (threshold: {threshold:.2%})',
                {
                    'emotion': emotion,
                    'percentage': f'{percentage:.2%}',
                    'threshold': f'{threshold:.2%}',
                    # Report the real row count; rebuilding it from the
                    # float share truncates (e.g. 0.29 * 100 -> 28).
                    'sample_count': int(emotion_counts[emotion])
                },
                alert_config
            )

    negative_percentage = sum(emotion_dist.get(emotion, 0) for emotion in negative_emotions)
    if negative_percentage > thresholds['negative_emotions_max']:
        send_alert(
            'high_negative_emotions',
            f'High negative emotions detected: {negative_percentage:.2%}',
            {
                'negative_percentage': f'{negative_percentage:.2%}',
                'threshold': f'{thresholds["negative_emotions_max"]:.2%}',
                'breakdown': {emotion: f'{emotion_dist.get(emotion, 0):.2%}' for emotion in negative_emotions}
            },
            alert_config
        )

def compare_kernels(X_train, X_test, y_train, y_test):
    """
    Train one SVM per kernel and report which performs best.

    Each of the linear, poly, rbf and sigmoid kernels is evaluated with
    evaluate_svm_kernel(); the metrics are tabulated, an accuracy bar chart
    is saved to 'kernel_accuracy_comparison.png', and the kernel with the
    highest accuracy is announced.

    Args:
        X_train, X_test: Training and testing features
        y_train, y_test: Training and testing labels

    Returns:
        tuple: Results dictionary (keyed by kernel name) and best kernel name
    """
    rule = "=" * 50
    print(rule)
    print("SVM KERNEL COMPARISON")
    print(rule)

    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    results = {
        name: evaluate_svm_kernel(name, X_train, X_test, y_train, y_test)
        for name in kernels
    }

    # Column label in the summary table -> key in each per-kernel result.
    metric_keys = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1-score',
    }
    comparison_data = {
        name: {column: results[name][key] for column, key in metric_keys.items()}
        for name in kernels
    }
    results_df = pd.DataFrame(comparison_data).T

    print("\n" + rule)
    print("PERFORMANCE COMPARISON SUMMARY")
    print(rule)
    print(results_df.round(4))

    accuracies = [results[name]['accuracy'] for name in kernels]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(kernels, accuracies,
                   color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
    plt.title('SVM Kernel Accuracy Comparison', fontsize=14, fontweight='bold')
    plt.xlabel('Kernel Type', fontsize=12)
    plt.ylabel('Accuracy', fontsize=12)
    plt.ylim(0, 1)

    # Annotate each bar with its accuracy just above the bar top.
    for bar, value in zip(bars, accuracies):
        x_center = bar.get_x() + bar.get_width()/2
        plt.text(x_center, bar.get_height() + 0.01,
                 f'{value:.4f}', ha='center', va='bottom', fontweight='bold')

    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('kernel_accuracy_comparison.png')
    plt.close()
    print("\n📈 Accuracy comparison plot saved as 'kernel_accuracy_comparison.png'")

    best_kernel = results_df['Accuracy'].idxmax()
    best_row = results_df.loc[best_kernel]
    best_accuracy = best_row['Accuracy']

    trophy_row = "🏆" * 50
    print("\n" + trophy_row)
    print("BEST MODEL IDENTIFICATION")
    print(trophy_row)
    print(f"🥇 Best performing kernel: {best_kernel.upper()}")
    print(f"🎯 Best accuracy: {best_accuracy:.4f}")
    print(f"📊 Full metrics for best model:")
    for metric, value in best_row.items():
        print(f"   {metric}: {value:.4f}")

    return results, best_kernel

def save_and_download_models(best_model, tfidf_vectorizer, kmeans_model, best_kernel):
    """
    Save trained models and download them

    The four artifacts are written to the current working directory with
    joblib. When running inside Colab they are then offered for download;
    elsewhere the download step is skipped. Errors are printed, not raised.

    Args:
        best_model: Trained SVM model
        tfidf_vectorizer: Fitted TF-IDF vectorizer
        kmeans_model: Trained K-means model
        best_kernel (str): Name of best performing kernel
    """
    print("\n" + "=" * 50)
    print("MODEL SAVING AND DOWNLOAD")
    print("=" * 50)

    model_filename = f'svm_emotion_model_{best_kernel}.pkl'
    artifacts = [
        (best_model, model_filename),
        (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
        (kmeans_model, 'kmeans_model.pkl'),
        (label_map, 'label_mapping.pkl'),
    ]

    try:
        for obj, filename in artifacts:
            joblib.dump(obj, filename)

        print("✅ Models saved successfully!")
        for _, filename in artifacts:
            print(f"   📁 {filename}")
    except Exception as e:
        print(f"❌ Error saving models: {e}")
        return

    # Distinguish "not running in Colab" from "download failed"; a bare
    # except used to report both (and KeyboardInterrupt) as the former.
    try:
        from google.colab import files
    except ImportError:
        print("⚠️ Download not available (not in Colab environment)")
        return

    try:
        for _, filename in artifacts:
            files.download(filename)
        print("📥 Files downloaded successfully!")
    except Exception as e:
        print(f"⚠️ Download failed: {e}")

def create_prediction_function(model, vectorizer):
    """
    Bind a trained model and vectorizer into a single-text predictor.

    Args:
        model: Trained SVM model
        vectorizer: Fitted TF-IDF vectorizer

    Returns:
        function: Callable taking a text and returning an emotion name
    """
    def predict_emotion(text):
        """
        Classify one text.

        Args:
            text (str): Input text

        Returns:
            str: Predicted emotion name; "Invalid input" for non-string or
            blank input; "Empty after preprocessing" when cleaning leaves
            nothing; "Unknown" when the predicted label has no name.
        """
        if isinstance(text, str) and text.strip():
            cleaned_text = preprocess_text(text)
            if cleaned_text:
                features = vectorizer.transform([cleaned_text])
                predicted = model.predict(features)[0]
                return label_map.get(predicted, "Unknown")
            return "Empty after preprocessing"
        return "Invalid input"

    return predict_emotion

def main():
    """
    Main function executing the complete emotion analysis pipeline

    Stages, in order: alert setup, dataset discovery/loading, feature
    preparation (training subsampled to 10000 rows), SVM kernel comparison,
    final model training with the best kernel, clustering analysis, emotion
    pattern monitoring, model saving, and sample/interactive predictions.

    Returns:
        tuple: (results, clustering_results, tfidf, predict_emotion) on
        success, or (None, None, None, None) if any stage fails.
    """
    print("🚀" * 20)
    print("TWEET EMOTION ANALYSIS PIPELINE")
    print("🚀" * 20)

    try:
        alert_config = setup_alerting_system()

        filepath = upload_dataset()
        if not filepath:
            print("❌ Cannot proceed without dataset")
            return None, None, None, None

        df = load_and_explore_data(filepath)
        if df is None:
            print("❌ Failed to load dataset")
            return None, None, None, None

        X_train, X_test, y_train, y_test, tfidf, df, train_indices = prepare_features(
            df, test_size=0.2, max_features=10000, subset_size=10000
        )

        results, best_kernel = compare_kernels(X_train, X_test, y_train, y_test)

        print("\n" + "🏗️" * 50)
        print("TRAINING FINAL SVM MODEL")
        print("🏗️" * 50)

        # NOTE(review): evaluate_svm_kernel already fitted a model with these
        # same parameters on the same training data (available as
        # results[best_kernel]['model']); this refit could likely be skipped.
        model_params = {
            'kernel': best_kernel,
            'probability': True,
            'random_state': 42
        }
        if best_kernel == 'poly':
            model_params['degree'] = 3
        elif best_kernel == 'rbf':
            model_params['gamma'] = 'scale'

        final_model = SVC(**model_params)
        final_model.fit(X_train, y_train)

        print(f"✅ Final SVM model trained with {best_kernel.upper()} kernel")

        clustering_results = perform_clustering_analysis(X_train, df, train_indices, n_clusters=6)

        monitor_emotion_patterns(results, clustering_results, alert_config)

        save_and_download_models(final_model, tfidf, clustering_results['kmeans_model'], best_kernel)

        predict_emotion = create_prediction_function(final_model, tfidf)

        print("\n" + "🎯" * 50)
        print("INTERACTIVE EMOTION PREDICTION")
        print("🎯" * 50)

        sample_texts = [
            "I'm so happy today!",
            "This makes me really angry",
            "I'm scared of what might happen",
            "I love spending time with my family",
            "I feel so sad and lonely",
            "What a surprise that was!"
        ]

        print("🔍 Testing with sample texts:")
        for text in sample_texts:
            emotion = predict_emotion(text)
            print(f"   Text: \"{text}\"")
            print(f"   Predicted Emotion: {emotion.upper()}")
            print()

        print("🎮 Try your own text (press Enter to skip):")
        try:
            user_input = input("Enter text to predict emotion: ").strip()
            if user_input:
                predicted_emotion = predict_emotion(user_input)
                print(f"\n🎯 Predicted Emotion for \"{user_input}\": {predicted_emotion.upper()}")
        except Exception:
            # Covers environments without stdin (EOFError, or a notebook
            # kernel that cannot prompt). KeyboardInterrupt/SystemExit are
            # deliberately not caught so the user can still abort the run.
            print("⚠️ Interactive input not available in this environment")

        print("\n" + "🎉" * 50)
        print("EMOTION ANALYSIS PIPELINE COMPLETE!")
        print("🎉" * 50)

        return results, clustering_results, tfidf, predict_emotion

    except Exception as e:
        print(f"❌ An error occurred: {str(e)}")
        return None, None, None, None

# Run the full pipeline when this cell/script is executed directly. The four
# values returned by main() are bound at module level; all four are None when
# the pipeline fails.
if __name__ == "__main__":
    results, clustering_results, vectorizer, predictor = main()

Downloading NLTK data...
✅ NLTK data downloaded successfully!
🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀
TWEET EMOTION ANALYSIS PIPELINE
🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀
ALERTING SYSTEM SETUP
✅ Alerting system configured!
📋 Alert thresholds:
   accuracy_min: 0.75
   anger_max: 0.3
   sadness_max: 0.4
   fear_max: 0.25
   negative_emotions_max: 0.6
✅ Found dataset: emotions (1).csv
LOADING AND EXPLORING DATASET
✅ Successfully loaded with utf-8 encoding
✅ Dataset loaded successfully!
📊 Dataset shape: (416809, 2)
📋 Columns: ['text', 'label']

🔍 First 5 rows:
                                                text  label
0      i just feel really helpless and heavy hearted      4
1  ive enjoyed being able to slouch about relax a...      0
2  i gave up my internship with the dmrg and am f...      4
3                         i dont know i feel so lost      0
4  i am a kindergarten teacher and i am thoroughl...      4

⚠️ Missing values before cleaning:
text     0
label    0
dtype: int64

🧹 Data cleaning summary:
   Original siz

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Files downloaded successfully!

🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯
INTERACTIVE EMOTION PREDICTION
🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯
🔍 Testing with sample texts:
   Text: "I'm so happy today!"
   Predicted Emotion: JOY

   Text: "This makes me really angry"
   Predicted Emotion: ANGER

   Text: "I'm scared of what might happen"
   Predicted Emotion: FEAR

   Text: "I love spending time with my family"
   Predicted Emotion: JOY

   Text: "I feel so sad and lonely"
   Predicted Emotion: SADNESS

   Text: "What a surprise that was!"
   Predicted Emotion: SURPRISE

🎮 Try your own text (press Enter to skip):
Enter text to predict emotion: Graduating feels amazing, but I'm going to miss all my college friends so much

🎯 Predicted Emotion for "Graduating feels amazing, but I'm going to miss all my college friends so much": SURPRISE

🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉
EMOTION ANALYSIS PIPELINE COMPLETE!
🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉