In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from transformers import pipeline
from deep_translator import GoogleTranslator
import warnings

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
2024-12-18 19:34:20.354551: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-18 19:34:20.441296: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-18 19:34:20.441362: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-18 19:34:20.441448: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-18 19:34:20.458648: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU

In [7]:
# Ensure results directory exists
os.makedirs("results/plots", exist_ok=True)
goemotions_test_data = "GoEmotions"

In [16]:
def load_goemotions_data(data_dir):
    """
    Load GoEmotions dataset from CSV files.
    :param data_dir: Path to directory containing GoEmotions CSV files
    :return: DataFrame with text and labels
    """
    import pandas as pd
    files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')]
    dataframes = [pd.read_csv(file) for file in files]
    data = pd.concat(dataframes, ignore_index=True)
    
    # Map to the relevant emotion columns
    emotion_columns = ['anger', 'joy', 'sadness', 'fear', 'disgust', 'surprise', 'neutral']
    data = data[['text'] + emotion_columns]
    print(f"Loaded {len(data)} samples from GoEmotions dataset.")
    return data


def validate_roberta_goemotions(data, sample_size=100):
    """
    Validate RoBERTa model on GoEmotions text data.
    :param data: DataFrame with 'text' and emotion columns
    :param sample_size: Number of samples to process
    :return: Ground truth and predicted labels
    """
    ground_truth = []
    predictions = []
    
    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-roberta-large")
    translator = GoogleTranslator(source='ru', target='en')
    
    emotion_columns = ['anger', 'happy', 'sad', 'fear', 'disgust', 'surprise', 'neutral']
    sampled_data = data.sample(sample_size, random_state=42)
    print(f"Validating RoBERTa on {len(sampled_data)} samples...")
    
    for _, row in tqdm(sampled_data.iterrows(), total=sample_size):
        text = row['text']
        label = max(emotion_columns, key=lambda x: row[x])  # Select the primary emotion
        ground_truth.append(label)
        
        try:
            # Translate to English if necessary
            english_text = translator.translate(text)
            # Classify emotion
            result = classifier(english_text)
            predicted_label = result[0]['label'].lower()
        except Exception as e:
            print(f"Error processing text: {e}")
            predicted_label = "error"
        predictions.append(predicted_label)
    
    return ground_truth, predictions


In [17]:
def visualize_text_results(ground_truth, predictions, save_dir, filename):
    """
    Generate confusion matrix and classification report for text data.
    :param ground_truth: List of ground truth labels
    :param predictions: List of predicted labels
    :param save_dir: Directory to save plots
    :param filename: Name of the saved plot
    """
    labels = ['anger', 'happy', 'sad', 'fear', 'disgust', 'surprise', 'neutral']
    cm = confusion_matrix(ground_truth, predictions, labels=labels)
    print("\nClassification Report:")
    print(classification_report(ground_truth, predictions, labels=labels))
    
    # Save confusion matrix plot
    cm_path = os.path.join(save_dir, filename)
    plot_confusion_matrix(cm, labels, filename.split(".")[0], cm_path)
    print(f"Confusion matrix saved at: {cm_path}")

def plot_confusion_matrix(cm, labels, title, save_path):
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(save_path)
    plt.close()

In [18]:
# ## Section 8: Run GoEmotions Validation
print("\nStarting GoEmotions validation with RoBERTa...")
goemotions_data = load_goemotions_data(goemotions_test_data)
text_ground_truth, text_predictions = validate_roberta_goemotions(goemotions_data, sample_size=100)
visualize_text_results(text_ground_truth, text_predictions, save_dir="results/plots", filename="goemotions_confusion_matrix.png")

print("Validation complete. Results saved.")


Starting GoEmotions validation with RoBERTa...


KeyError: "['happy', 'sad'] not in index"