# MUST-IN: Multilingual Hate Speech Detection Analysis

This notebook provides a comprehensive analysis and visualization of the MUST-IN framework results.

In [None]:
import os
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings('ignore')

# Set plotting style
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*", so try
# the new name first and fall back to the pre-3.6 name on older installs.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # Neither bundled style sheet exists; let seaborn apply its own darkgrid.
    sns.set_style('darkgrid')
sns.set_palette('husl')

# Later cells save figures under results/; create it up front so savefig does
# not fail on a fresh checkout where main.py has not been run yet.
os.makedirs('results', exist_ok=True)

## 1. Data Exploration

In [None]:
# Read the raw sample dataset and report its size and column names.
df_raw = pd.read_csv('data/raw/sample_dataset.csv')

print(f"Total records: {df_raw.shape[0]}")
print(f"\nColumns: {list(df_raw.columns)}")

# Last expression of the cell: rich preview of the first rows.
df_raw.head()

In [None]:
# Dataset statistics: one value-count table per categorical column.
print("Dataset Statistics:")
print("=" * 50)

for heading, column in (
    ('Label', 'label'),
    ('Language', 'language'),
    ('Platform', 'platform'),
):
    print(f"\n{heading} Distribution:")
    print(df_raw[column].value_counts())

In [None]:
# Bar charts of the label, language and platform distributions, side by side.
# Each entry: (column in df_raw, display name used in title/axis, bar colour).
panels = [
    ('label', 'Label', 'skyblue'),
    ('language', 'Language', 'lightcoral'),
    ('platform', 'Platform', 'lightgreen'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(16, 5))

for ax, (column, display_name, bar_color) in zip(axes, panels):
    counts = df_raw[column].value_counts()
    counts.plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(f'{display_name} Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel(display_name)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
fig.savefig('results/data_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Count of samples for every (language, label) pair.
crosstab = pd.crosstab(index=df_raw['language'], columns=df_raw['label'])
print("\nLanguage vs Label Distribution:")
print(crosstab)

# Render the counts as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    crosstab,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Language vs Label Heatmap', fontsize=14, fontweight='bold')
ax.set_xlabel('Label')
ax.set_ylabel('Language')
fig.tight_layout()
fig.savefig('results/language_label_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Text Analysis

In [None]:
# Derive per-sample size features from the raw text column.
text_series = df_raw['text']
df_raw['text_length'] = text_series.str.len()            # characters per sample
df_raw['word_count'] = text_series.str.split().str.len()  # whitespace-separated tokens

# Summary statistics of both features, grouped by label.
length_stats = df_raw.groupby('label')[['text_length', 'word_count']].describe()
print("Text Length Statistics:")
print(length_stats)

In [None]:
# Box plots of character length and word count, grouped by label.
# Each entry: (column in df_raw, panel title, y-axis label).
panels = [
    ('text_length', 'Text Length by Label', 'Character Length'),
    ('word_count', 'Word Count by Label', 'Word Count'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(14, 5))

for ax, (column, title, ylabel) in zip(axes, panels):
    df_raw.boxplot(column=column, by='label', ax=ax)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Label')
    ax.set_ylabel(ylabel)
    # Same tick-rotation call the distribution plots use; no pyplot state needed.
    ax.tick_params(axis='x', rotation=45)

# pandas adds an automatic "Boxplot grouped by label" figure title whenever
# `by=` is used; clear it so it does not compete with the per-panel titles.
fig.suptitle('')

fig.tight_layout()
fig.savefig('results/text_length_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Preprocessing Comparison

In [None]:
# Read the preprocessed dataset produced by the cleaning step.
df_clean = pd.read_csv('data/processed/dataset_cleaned.csv')

# Pair the first ten raw texts with the first ten cleaned texts.
# NOTE(review): rows are matched by index position; this assumes the cleaning
# step neither dropped nor reordered rows -- confirm against the pipeline.
raw_preview = df_raw['text'].head(10)
clean_preview = df_clean['clean_text'].head(10)
comparison_df = pd.DataFrame({'Original': raw_preview, 'Cleaned': clean_preview})

print("\nOriginal vs Cleaned Text (First 10 samples):")
print("=" * 80)
for idx, original_text, cleaned_text in zip(
    comparison_df.index, comparison_df['Original'], comparison_df['Cleaned']
):
    print(f"\n{idx+1}.")
    print(f"  Original: {original_text}")
    print(f"  Cleaned:  {cleaned_text}")

## 4. Model Performance Comparison

In [None]:
# Note: This section requires running main.py first to generate confusion matrices
results_dir = 'results'
cm_suffix = '_confusion_matrix.png'

# A missing results directory is treated the same as "no matrices yet", so the
# friendly message below is shown instead of a FileNotFoundError. The listing is
# sorted because os.listdir returns entries in arbitrary order and the panel
# selection should be the same on every run.
if os.path.isdir(results_dir):
    confusion_matrices = sorted(
        f for f in os.listdir(results_dir) if f.endswith(cm_suffix)
    )
else:
    confusion_matrices = []

if confusion_matrices:
    print(f"Found {len(confusion_matrices)} confusion matrix visualizations")

    # Display at most four confusion matrices in a fixed 2x2 grid.
    num_to_display = min(4, len(confusion_matrices))
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    axes = axes.flatten()

    # Switch every panel off first so unused slots stay blank instead of
    # showing empty framed axes when fewer than four images exist.
    for ax in axes:
        ax.axis('off')

    for ax, cm_file in zip(axes, confusion_matrices[:num_to_display]):
        # Context manager closes the file handle once the pixels are drawn.
        with Image.open(os.path.join(results_dir, cm_file)) as img:
            ax.imshow(img)
        ax.set_title(cm_file.replace(cm_suffix, ''), fontsize=10)

    fig.tight_layout()
    fig.savefig('results/confusion_matrices_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No confusion matrices found. Please run main.py first.")

## 5. Model Metrics Summary

In [None]:
# Summary table of the evaluated model configurations.
# Note: metric values would ideally be loaded from saved results; for now this
# is a template that only lists which classifier/vectorizer pairs exist.

# (short tag used in the model name, display name)
classifiers = [
    ('MNB', 'Multinomial NB'),
    ('GNB', 'Gaussian NB'),
    ('SVM', 'SVM (Linear)'),
    ('RF', 'Random Forest'),
]
vectorizers = [
    ('Count', 'Count'),
    ('TFIDF', 'TF-IDF'),
]

# Every classifier is paired with every vectorizer, classifier-major order.
model_rows = [
    (f'{clf_tag}_{vec_tag}', vec_name, clf_name)
    for clf_tag, clf_name in classifiers
    for vec_tag, vec_name in vectorizers
]

models_summary = {
    'Model': [row[0] for row in model_rows],
    'Vectorizer': [row[1] for row in model_rows],
    'Classifier': [row[2] for row in model_rows],
}

results_df = pd.DataFrame(models_summary)
print("\nModel Configurations:")
print(results_df.to_string(index=False))

## 6. LIME Explainability Analysis

In [None]:
# Report whether the LIME explanation HTML is present on disk.
lime_file = 'results/lime_explanation_traditional.html'

if not os.path.exists(lime_file):
    print("LIME explanation not found. Please run main.py first.")
else:
    # One entry per output line; joined so the text is emitted in a single call.
    lime_notes = [
        f"LIME explanation available at: {lime_file}",
        "\nOpen this file in a web browser to view the explanation.",
        "\nLIME provides:",
        "  - Feature importance for the prediction",
        "  - Words that contributed most to the classification",
        "  - Probability scores for each class",
    ]
    print("\n".join(lime_notes))

## 7. Recommendations and Next Steps

In [None]:
# Print the closing list of recommendations as numbered sections.
banner = "=" * 70

# (section heading, bullet points) in display order; numbering is added below.
recommendations = [
    ("DATA ENHANCEMENT", [
        "Collect more diverse hate speech examples",
        "Balance the dataset across all three labels",
        "Include more Romanized script variations",
    ]),
    ("MODEL IMPROVEMENTS", [
        "Enable deep learning models (set RUN_DL=True in main.py)",
        "Experiment with XLM-RoBERTa for better multilingual support",
        "Implement cross-validation for robust evaluation",
    ]),
    ("PREPROCESSING ENHANCEMENTS", [
        "Expand transliteration dictionaries",
        "Add language-specific stopword removal",
        "Implement advanced emoji handling",
    ]),
    ("EVALUATION EXTENSIONS", [
        "Add per-language performance metrics",
        "Implement error analysis pipeline",
        "Create comparative visualization dashboard",
    ]),
    ("DEPLOYMENT CONSIDERATIONS", [
        "Create REST API for model serving",
        "Implement model versioning",
        "Add monitoring and logging capabilities",
    ]),
]

print("\n" + banner)
print("PROJECT COMPLETION RECOMMENDATIONS")
print(banner)

for number, (heading, bullet_points) in enumerate(recommendations, start=1):
    print(f"\n{number}. {heading}:")
    for point in bullet_points:
        print(f"   - {point}")

print("\n" + banner)