# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.model_selection import train_test_split


def get_custom_stopwords():
    """
    Get custom stopwords list for political content analysis.

    The list is scikit-learn's built-in English stop words plus a handful of
    extra filler terms ('said', 'says', 'mr', 'mrs', 'would', 'also', 'one',
    'u', 'us').

    Returns:
        list: De-duplicated stopwords in alphabetical order
    """
    extra_terms = {'said', 'says', 'mr', 'mrs', 'would', 'also', 'one', 'u', 'us'}
    english_terms = TfidfVectorizer(stop_words='english').get_stop_words()
    # Sort instead of returning raw set order: iterating a set of strings
    # yields a different order in each interpreter session (hash
    # randomization), so the returned list was not reproducible across runs.
    return sorted(set(english_terms) | extra_terms)


def preprocess_text_data(df, content_column='content', label_column='bias', max_features=5000,
                         vectorizer=None):
    """
    Preprocess text data with TF-IDF vectorization and custom stopwords.

    Args:
        df (pandas.DataFrame): Input dataframe with text and labels
        content_column (str): Column name containing text content
        label_column (str): Column name containing labels
        max_features (int): Maximum number of features for TF-IDF; only used
            when a new vectorizer is created
        vectorizer: Optional already-fitted vectorizer. When given, it is applied
            with ``transform`` (no refit) so that other splits share the
            feature space of the data it was fitted on. When None (default),
            a new TfidfVectorizer is created and fitted on ``df``.

    Returns:
        tuple: (X, y, vectorizer) where X is the TF-IDF matrix, y is the label
               column taken from ``df`` (not converted to a NumPy array), and
               vectorizer is the fitted vectorizer that produced X
    """
    # Missing values are not valid documents for the vectorizer; treat them
    # as empty text instead of failing on the whole frame.
    texts = df[content_column].fillna("")

    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            stop_words=get_custom_stopwords(),
            max_features=max_features,
        )
        X = vectorizer.fit_transform(texts)
    else:
        # Reuse the supplied vocabulary/IDF weights; do not refit.
        X = vectorizer.transform(texts)

    y = df[label_column]
    return X, y, vectorizer


def apply_smote(X, y, random_state=42):
    """
    Resample a dataset with SMOTE to balance its classes.

    Args:
        X: Feature matrix
        y: Target labels
        random_state (int): Seed passed to SMOTE for reproducibility

    Returns:
        tuple: (X_res, y_res) the resampled features and labels
    """
    resampled = SMOTE(random_state=random_state).fit_resample(X, y)
    balanced_features, balanced_labels = resampled
    return balanced_features, balanced_labels


def prepare_train_test_data(X, y, test_size=0.2, random_state=42, stratify=None):
    """
    Split data into training and testing sets.

    Args:
        X: Feature matrix
        y: Target labels
        test_size (float): Proportion of data to use for testing
        random_state (int): Random seed for reproducibility
        stratify: Optional array-like of class labels forwarded to
            ``train_test_split``. Pass ``y`` to keep the class proportions
            the same in both splits; None (default) gives a plain random
            split, as before.

    Returns:
        list: [X_train, X_test, y_train, y_test] split datasets, exactly as
              returned by ``train_test_split``
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )


def load_and_prepare_data(dataset_name="siddharthmb/article-bias-prediction-random-splits",
                         split="train", apply_balancing=True, max_features=5000):
    """
    Download a dataset, vectorize one of its splits and optionally balance it.

    The TF-IDF vectorizer is fitted on the selected split itself.

    Args:
        dataset_name (str): HuggingFace dataset name
        split (str): Dataset split to use ('train', 'valid', 'test')
        apply_balancing (bool): Whether to apply SMOTE for class balancing
        max_features (int): Maximum number of TF-IDF features

    Returns:
        tuple: (X, y, vectorizer) processed data
    """
    from datasets import load_dataset

    # Fetch the dataset and turn the requested split into a DataFrame
    frame = pd.DataFrame(load_dataset(dataset_name)[split])

    # TF-IDF features, labels and the fitted vectorizer
    features, labels, vectorizer = preprocess_text_data(frame, max_features=max_features)

    # Without balancing, the vectorized data is returned as-is
    if not apply_balancing:
        return features, labels, vectorizer

    features, labels = apply_smote(features, labels)
    return features, labels, vectorizer


def save_processed_data(X, y, vectorizer, output_dir="data/processed", prefix="tfidf"):
    """
    Save processed data to disk.

    Three files are written into ``output_dir``:
    ``{prefix}_features.npz`` (sparse matrix), ``{prefix}_labels.npy``
    (NumPy array) and ``{prefix}_vectorizer.pkl`` (pickle).

    Args:
        X: Feature matrix. Sparse matrices in CSC/CSR/BSR/DIA/COO format are
            stored as-is; any other sparse format or a dense 2-D array is
            converted to CSR first.
        y: Target labels (array-like, e.g. NumPy array or pandas Series)
        vectorizer: Fitted TF-IDF vectorizer (any picklable object)
        output_dir (str): Directory to save files
        prefix (str): Prefix for filenames

    Returns:
        dict: Paths to saved files
    """
    import os
    import pickle
    from scipy import sparse

    # Create directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # save_npz only supports these storage formats; convert everything else
    # (dense arrays, LIL/DOK matrices, ...) rather than failing.
    if not sparse.issparse(X) or X.format not in ("csc", "csr", "bsr", "dia", "coo"):
        X = sparse.csr_matrix(X)

    # Save features as sparse matrix
    features_path = os.path.join(output_dir, f"{prefix}_features.npz")
    sparse.save_npz(features_path, X)

    # String labels held in an object-dtype array would be pickled by np.save
    # and then rejected by a plain np.load; store them as a fixed-width
    # string array so the file can be loaded without allow_pickle.
    labels = np.asarray(y)
    if labels.dtype == object and all(isinstance(v, str) for v in labels.ravel()):
        labels = labels.astype(str)

    # Save labels
    labels_path = os.path.join(output_dir, f"{prefix}_labels.npy")
    np.save(labels_path, labels)

    # Save vectorizer
    vectorizer_path = os.path.join(output_dir, f"{prefix}_vectorizer.pkl")
    with open(vectorizer_path, "wb") as f:
        pickle.dump(vectorizer, f)

    return {
        "features": features_path,
        "labels": labels_path,
        "vectorizer": vectorizer_path
    }


def load_processed_data(input_dir="data/processed", prefix="tfidf"):
    """
    Read back the feature matrix, labels and vectorizer stored on disk.

    Expects ``{prefix}_features.npz``, ``{prefix}_labels.npy`` and
    ``{prefix}_vectorizer.pkl`` inside ``input_dir``.

    NOTE: the vectorizer is restored with ``pickle.load``, which can execute
    arbitrary code; only point this at directories you trust.

    Args:
        input_dir (str): Directory containing saved files
        prefix (str): Prefix for filenames

    Returns:
        tuple: (X, y, vectorizer) loaded data
    """
    import os
    import pickle
    from scipy import sparse

    # Resolve the three artifact locations up front
    feature_file, label_file, vectorizer_file = (
        os.path.join(input_dir, filename)
        for filename in (
            f"{prefix}_features.npz",
            f"{prefix}_labels.npy",
            f"{prefix}_vectorizer.pkl",
        )
    )

    # Sparse feature matrix
    features = sparse.load_npz(feature_file)

    # Label array
    labels = np.load(label_file)

    # Pickled vectorizer
    with open(vectorizer_file, "rb") as handle:
        fitted_vectorizer = pickle.load(handle)

    return features, labels, fitted_vectorizer


def analyze_class_distribution(y, class_names=None):
    """
    Analyze and visualize class distribution.

    Args:
        y: Target labels
        class_names (list): Display names for the classes, one per distinct
            value in ``y``, in the sorted order returned by ``np.unique``.
            Defaults to "Class <value>" for each distinct value.

    Returns:
        tuple: (class_counts, class_proportions, figure) distribution statistics and visualization

    Raises:
        ValueError: If ``class_names`` does not have exactly one entry per
            distinct class in ``y``.
    """
    # Count class frequencies
    unique_classes, counts = np.unique(y, return_counts=True)

    # Prepare class names
    if class_names is None:
        class_names = [f"Class {i}" for i in unique_classes]
    elif len(class_names) != len(unique_classes):
        # zip() below would silently drop classes (or names), so the
        # statistics would no longer describe the data.
        raise ValueError(
            f"class_names has {len(class_names)} entries but y contains "
            f"{len(unique_classes)} distinct classes"
        )

    # Imported after validation so invalid input fails before the plotting
    # library is loaded.
    import matplotlib.pyplot as plt

    # Calculate proportions
    total = len(y)
    proportions = counts / total

    # Create dictionary of counts and proportions
    class_counts = dict(zip(class_names, counts))
    class_proportions = dict(zip(class_names, proportions))

    # Create a bar plot
    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(class_names, counts, color=['#3a86ff', '#8338ec', '#ff006e'])

    # Add count and percentage labels just above each bar
    for bar, count, prop in zip(bars, counts, proportions):
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width()/2.,
            height + 0.1,
            f"{count} ({prop:.1%})",
            ha='center', va='bottom'
        )

    # Add labels and title
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')
    ax.set_title('Class Distribution')
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    return class_counts, class_proportions, fig