In [None]:
# Silence warnings (mainly Hugging Face internal warnings) so cell output stays readable.

# Set the environment variable too, so processes started later from this kernel inherit it.
%env PYTHONWARNINGS=ignore
import warnings
# Suppress warnings raised inside this kernel process.
warnings.filterwarnings("ignore")
# `os` is used further down to build output paths and to list the result directories.
import os

In [2]:
import json
import fasttext
import pandas as pd
import cudf
import dask_cudf
import numpy as np
import cupy as cp
from pathlib import Path
from typing import Optional, Tuple, Any, Dict, List
from huggingface_hub import hf_hub_download

from nemo_curator import get_client
from nemo_curator.classifiers import FineWebNemotronEduClassifier, FineWebMixtralEduClassifier
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import load_object_on_worker
from nemo_curator.utils.distributed_utils import get_device_total_memory

### Initializing NeMo Curator Client
This step initializes the NeMo Curator client to enable distributed classification using GPU-based processing.

In [None]:
# Restrict this kernel (and anything it launches afterwards) to the GPU with index 3.
# NOTE(review): the index is machine-specific — adjust it for the system you run on.
%env CUDA_VISIBLE_DEVICES 3

In [None]:
# Create the distributed client through NeMo Curator's helper, requesting a GPU cluster.
client = get_client(cluster_type="gpu")

### Setting Output File Paths
Defines the paths where classification results, threshold values, and final bucketed results will be stored.

In [5]:
# Define output locations. OUTPUT_BASE_DIR is a relative path, so everything is
# written below the notebook's current working directory.
OUTPUT_BASE_DIR = "output_data_dir/"
# Parquet dataset holding the raw floating-point classifier scores (Step 1).
OUTPUT_CLASSIFICATION_RESULTS = os.path.join(OUTPUT_BASE_DIR, "classification_results")
# JSON file holding the percentile thresholds (Step 2).
OUTPUT_CLASSIFIER_THRESHOLDS = os.path.join(OUTPUT_BASE_DIR, "classifier_thresholds.json")
# Parquet dataset partitioned by the final ensemble score (Step 5).
OUTPUT_BUCKETED_RESULTS = os.path.join(OUTPUT_BASE_DIR, "bucketed_results")

# Preparing Text Data for Classification
- We create a sample dataset with diverse topics.
- Optionally, users can provide a directory containing JSONL files for classification.

In [6]:
# Create a small in-memory sample: ten one-sentence documents on unrelated topics.
text = [
    "Quantum computing is set to revolutionize the field of cryptography.",
    "Investing in index funds is a popular strategy for long-term financial growth.",
    "Recent advancements in gene therapy offer new hope for treating genetic disorders.",
    "Online learning platforms have transformed the way students access educational resources.",
    "Traveling to Europe during the off-season can be a more budget-friendly option.",
    "Training regimens for athletes have become more sophisticated with the use of data analytics.",
    "Streaming services are changing the way people consume television and film content.",
    "Vegan recipes have gained popularity as more people adopt plant-based diets.",
    "Climate change research is critical for developing sustainable environmental policies.",
    "Telemedicine has become increasingly popular due to its convenience and accessibility.",
]
# Build a single-partition, GPU-backed dataset with one "text" column.
df = cudf.DataFrame({"text": text})
input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))
# NOTE(review): `write_to_filename` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
write_to_filename = False

# Alternative: read an existing directory of JSONL files instead of the sample above.
# input_file_path="/input_data_dir/"
# input_dataset = DocumentDataset.read_json(
#     input_file_path, backend="cudf", add_filename=True
# )
# write_to_filename = True

# Step 1: Run the Classifiers

1. Compute the floating-point classification score for each classifier.

**Note:** Dask operations are lazy, meaning the classifiers won’t execute until an eager operation like `to_json`, `compute`, or `persist` is called.

### FastText Quality Classifier

The **FastText Quality Classifier** uses the [`fasttext-oh-eli5`](https://huggingface.co/mlfoundations/fasttext-oh-eli5) model from Hugging Face to assess text quality. It distinguishes **high-quality** (`__label__hq`) responses from lower-quality ones (`__label__cc`).  

NeMo Curator allows users to define custom modules like this, enabling seamless integration of specialized models.  

- **Model:** [`mlfoundations/fasttext-oh-eli5`](https://huggingface.co/mlfoundations/fasttext-oh-eli5)  
- **Training Data:** OpenHermes + Reddit ELI5 vs. RefinedWeb (200k examples)  
- **Output:** Confidence score + optional binary classification (where 1 represents high quality text and 0 represents low quality text)  

🔗 **More details:** [Hugging Face Model Card](https://huggingface.co/mlfoundations/fasttext-oh-eli5)

In [7]:
class FastTextQualityClassifier:
    """
    A classifier that uses a fastText model to predict a confidence score for text.

    It appends one or two output columns to the data:
      - A float32 column holding the score attached to the "hq" label.
      - Optionally, an int32 column (1 if the top label contains "hq", else 0).

    The model file is downloaded from the Hugging Face Hub during initialization;
    the model itself is loaded lazily on each worker the first time it is needed.

    Args:
        pred_column (str): Name of the output column for the confidence score.
        int_column (str, optional): Name of the output column for the binary indicator.
                                    If not provided, only the pred_column is added.
    """

    def __init__(self, pred_column: str, int_column: Optional[str] = None) -> None:
        self.pred_column: str = pred_column
        self.int_column: Optional[str] = int_column

        self.repo_id: str = "mlfoundations/fasttext-oh-eli5"
        self.model_filename: str = "openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train.bin"
        # Download the fastText model from Hugging Face Hub.
        self.model_path: str = hf_hub_download(repo_id=self.repo_id, filename=self.model_filename)
        # Key under which the loaded model is stored on each worker.
        self.model_identifier: str = f"{self.repo_id}/{self.model_filename}"

    def _load_fasttext_model(self) -> Any:
        """Load and return the fastText model."""
        return fasttext.load_model(self.model_path)

    def _get_model(self) -> Any:
        """Return the fastText model for the current worker, loading it if necessary."""
        return load_object_on_worker(self.model_identifier, self._load_fasttext_model, {})

    @staticmethod
    def _score_with_model(model: Any, text: str) -> Tuple[float, int]:
        """
        Score a single text with an already-loaded model.

        Args:
            model: A loaded fastText model.
            text (str): The input text to classify.

        Returns:
            Tuple[float, int]: The score of the "hq" label and the binary indicator.
        """
        # fastText's predict() handles one line at a time and raises ValueError when the
        # input contains a newline, so multi-line documents are flattened first.
        labels, scores = model.predict(text.replace("\n", " "), k=2)
        # labels/scores are ordered by decreasing score. If the top label contains "hq",
        # its score is returned; otherwise the second score is used.
        if "hq" in labels[0]:
            return float(scores[0]), 1
        return float(scores[1]), 0

    def predict_text(self, text: str) -> Tuple[float, int]:
        """
        Predict the confidence score and binary indicator for a given text.

        Args:
            text (str): The input text to classify.

        Returns:
            Tuple[float, int]: A tuple containing the confidence score (float) and binary indicator (int).
        """
        return self._score_with_model(self._get_model(), text)

    def _predict_on_partition(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply predictions to a pandas DataFrame partition.

        Assumes the DataFrame has a "text" column.

        Args:
            df (pd.DataFrame): Input DataFrame partition.

        Returns:
            pd.DataFrame: DataFrame with added prediction columns.
        """
        # Resolve the model once per partition instead of once per row.
        model = self._get_model()
        predictions = [self._score_with_model(model, text) for text in df["text"]]
        # Arrays are assigned positionally, so the partition's index is left untouched.
        df[self.pred_column] = np.array([score for score, _ in predictions], dtype=np.float32)
        if self.int_column is not None:
            df[self.int_column] = np.array([flag for _, flag in predictions], dtype=np.int32)
        return df

    def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
        """
        Apply the classifier to a distributed dataset.

        The dataset should have a "text" column. The classifier converts the dataset
        to a pandas backend, applies predictions to each partition, and then converts the result
        back to cudf.

        Args:
            dataset: A distributed DataFrame (e.g., a Dask DataFrame) containing a "text" column.

        Returns:
            DocumentDataset: The dataset with added prediction columns.
        """
        meta = dataset.df._meta
        if hasattr(meta, "to_pandas"):
            meta = meta.to_pandas()
        else:
            # Already pandas: copy so that adding the output columns below does not
            # modify the input collection's own metadata object.
            meta = meta.copy()
        meta[self.pred_column] = np.float32(0.0)
        if self.int_column is not None:
            meta[self.int_column] = np.int32(0)

        processed_df = dataset.df.to_backend("pandas").map_partitions(self._predict_on_partition, meta=meta)
        processed_df = processed_df.to_backend("cudf")
        return DocumentDataset(processed_df)


In [None]:
# Define classifier score mapping: for every classifier, the names of its integer and
# floating-point output columns. Each integer name is the float name plus "-int",
# which is the naming convention the later steps rely on.
classifier_scores = {
    "nemotron-score": {
        "int_score": "fineweb-nemotron-edu-score-int",
        "float_score": "fineweb-nemotron-edu-score"
    },
    "mixtral-score": {
        "int_score": "fineweb-mixtral-edu-score-int",
        "float_score": "fineweb-mixtral-edu-score"
    },
    "fasttext-score": {
        "int_score": "fasttext-quality-score-int",
        "float_score": "fasttext-quality-score"
    }
}



# Initialize classifiers. Constructing FastTextQualityClassifier downloads its model
# file from the Hugging Face Hub.
classifiers = [
    FineWebNemotronEduClassifier(batch_size=1024,
                         pred_column=classifier_scores["nemotron-score"]["float_score"],
                         int_column=classifier_scores["nemotron-score"]["int_score"]),
    FineWebMixtralEduClassifier(batch_size=1024,
                         pred_column=classifier_scores["mixtral-score"]["float_score"],
                         int_column=classifier_scores["mixtral-score"]["int_score"]),
    FastTextQualityClassifier(pred_column=classifier_scores["fasttext-score"]["float_score"],
                         int_column=classifier_scores["fasttext-score"]["int_score"])
]

In [9]:
# Close the cluster and shut down the client created at the top of the notebook.
# NOTE(review): this cell is placed before the cell that actually executes the
# classifiers, and its execution count is duplicated further down — confirm that
# shutting the client down at this point is intended for a top-to-bottom run.
client.cluster.close()
client.shutdown()

In [None]:
# Chain the classifiers: each one receives the previous result and adds its own columns.
output_dataset = input_dataset
for classifier in classifiers:
    output_dataset = classifier(dataset=output_dataset)

# Drop the classifiers' own integer columns: Step 3 recreates the integer scores
# from percentile-based thresholds instead.
# From here on `output_dataset` is the underlying dataframe, not a DocumentDataset.
output_dataset = output_dataset.df.drop(columns=[v["int_score"] for v in classifier_scores.values()])
# Writing the results is an eager operation, so this is where the classifiers actually run.
output_dataset.to_parquet(path=OUTPUT_CLASSIFICATION_RESULTS)

In [9]:
# Drop the references to the classifiers and the intermediate datasets; the scored
# data is re-read from disk in the next cell.
del classifiers, output_dataset, input_dataset

### Read Back the Scored DataFrame

In [None]:
# Load the Parquet results written in Step 1 as a cudf-backed dataset and preview them.
scored_data = DocumentDataset.read_parquet(OUTPUT_CLASSIFICATION_RESULTS, backend="cudf")
scored_data.df.head()

# Step 2: Compute Score Thresholds

### Why Compute Thresholds?
- To categorize classification scores into percentile-based bins.
- Ensures results are comparable across different classifiers.

### Approach:
1. **Extract classifier scores** from the sampled dataset.
2. **Compute weighted percentiles** for each classifier.
3. **Save percentile thresholds** for later use in mapping scores.

> **Note:** The percentile calculation is weighted by text length (the UTF-8 byte count of each document, used here as a proxy for token count) so that longer texts have a greater impact on the thresholds. This ensures that the bins accurately reflect the distribution of content, giving a more meaningful categorization of the scores.

In [11]:
def weighted_percentile(data, percentiles, weights):
    """
    Weighted percentiles using the "inverted_cdf" rule.

    Each requested percentile is answered with the smallest data value whose
    normalized cumulative weight reaches that percentile.

    Parameters:
      data : array-like, the data values.
      percentiles : scalar or array-like, percentiles in [0, 100].
      weights : array-like, one weight per data value.

    Returns:
      A NumPy array with one value per requested percentile.
    """
    values = np.asarray(data)
    value_weights = np.asarray(weights)

    # Order the values, carrying each weight along with its value.
    order = np.argsort(values)
    ordered_values = values[order]

    # Empirical weighted CDF: running weight total scaled so the last entry is 1.
    running_weight = np.cumsum(value_weights[order])
    cdf = running_weight / running_weight[-1]

    # side="left" selects the first position where the CDF is >= the target quantile.
    return np.array(
        [
            ordered_values[np.searchsorted(cdf, pct / 100.0, side="left")]
            for pct in np.atleast_1d(percentiles)
        ]
    )


def compute_thresholds(score_ar: np.ndarray, token_ar: np.ndarray) -> Dict[str, float]:
    """
    Weighted percentile thresholds (5th, 10th, ..., 95th) for one array of scores.

    Args:
        score_ar (np.ndarray): The scores.
        token_ar (np.ndarray): The weight of each score.

    Returns:
        Dict[str, float]: Mapping from percentile to threshold value. The keys are
        built with ``int(...)``, i.e. they are integers (5, 10, ..., 95).
    """
    percentile_grid = np.arange(5, 100, 5)
    # NumPy < 2.0 cannot combine `weights` with method="inverted_cdf" in np.percentile,
    # so the equivalent computation is done by the hand-written helper. With NumPy >= 2.0
    # this would be:
    #   np.percentile(score_ar, percentile_grid, weights=token_ar, method="inverted_cdf")
    cutoffs = weighted_percentile(score_ar, percentile_grid, weights=token_ar)

    threshold_by_percentile = {}
    for pct, cutoff in zip(percentile_grid, cutoffs):
        # Convert NumPy scalars to plain Python numbers.
        threshold_by_percentile[int(pct)] = float(cutoff)
    return threshold_by_percentile


def compute_thresholds_for_score_columns(
    df: cudf.DataFrame, text_col_name: str, score_col_names: List[str]
) -> Dict[str, Dict[str, float]]:
    """
    Compute percentile-based thresholds for all specified score columns in a DataFrame.

    Every row is weighted by the byte length of its text, which serves as the
    per-document weight in place of an actual token count.

    Args:
        df (cudf.DataFrame): The DataFrame containing the score columns and text column.
        text_col_name (str): The name of the text column used to derive the weights.
        score_col_names (List[str]): List of column names for which thresholds should be computed.

    Returns:
        Dict[str, Dict[str, float]]: A dictionary mapping each score column to its percentile thresholds.
    """
    # The weights are identical for every score column, so they are copied from
    # device to host once instead of once per column.
    weights = df[text_col_name].str.byte_count().values.get()

    return {
        score_col: compute_thresholds(df[score_col].values.get(), weights)
        for score_col in score_col_names
    }


def save_thresholds(threshold_dict: Dict[str, Dict[str, float]], file_name: str = "thresholds.json") -> None:
    """
    Save computed thresholds to a JSON file.

    Args:
        threshold_dict (Dict[str, Dict[str, float]]): The dictionary containing computed thresholds.
        file_name (str, optional): The name of the output JSON file. Defaults to "thresholds.json".
    Returns:
        None
    """
    with open(file_name, 'w', encoding="utf-8") as fout:
        # Serialize the thresholds themselves (not the file name). JSON object keys are
        # always strings, so integer percentile keys are written as e.g. "5".
        json.dump(threshold_dict, fout, indent=4)
    print(f"Thresholds saved to {file_name}")

In [None]:
# Sample at most as much data as fits in roughly half of a single GPU's memory.
gpu_memory_available = get_device_total_memory() / 2
dataset_bytes = scored_data.df.memory_usage(deep=True).sum().compute()
# Fraction of rows to keep: 1.0 when everything fits, otherwise budget / dataset size.
# The fraction must never exceed 1, since sampling is done without replacement.
frac = min(1.0, float(gpu_memory_available / dataset_bytes)) if dataset_bytes > 0 else 1.0
# Collapse the sample into one partition so the thresholds are computed over all of it at once.
sampled_data = scored_data.df.sample(frac=frac).repartition(npartitions=1)

score_col_names = [v["float_score"] for v in classifier_scores.values()]
# With a single partition there is a single result; `.iloc[0]` extracts it.
threshold_dict = sampled_data.map_partitions(compute_thresholds_for_score_columns, text_col_name="text", score_col_names=score_col_names).compute().iloc[0]
save_thresholds(threshold_dict, OUTPUT_CLASSIFIER_THRESHOLDS)

In [None]:
# Display the computed thresholds for inspection.
threshold_dict

# Step 3: Convert Floating-Point Scores to Integer Scores

### Why Convert?
- Floating-point scores are mapped to integer categories (0-19) for easier comparison.
- Integer scores are computed using **percentile-based thresholds**.

### Process:
1. **Retrieve percentile thresholds** computed in Step 2 (the in-memory `threshold_dict`, which is also saved to JSON).
2. **Apply the thresholds to map scores to integer bins**.
3. **Store integer scores in the dataset** for final ensemble computation.

In [14]:
def map_scores(df, score_col_name: str, score_int_name: str, bins: List[float]):
    """
    Bucket one score column into integer bins.

    The values of ``score_col_name`` are converted to a CuPy array, assigned a bin
    index with ``cp.digitize`` against ``bins``, and stored in ``df`` under
    ``score_int_name``. The same DataFrame object is modified and returned.
    """
    raw_scores = cp.array(df[score_col_name])
    df[score_int_name] = cp.digitize(raw_scores, bins)
    return df

def map_score_columns(df: cudf.DataFrame, score_col_names: List[str], threshold_dict: Dict[str, dict]):
    """
    Add an integer score column for every column listed in score_col_names.

    For each score column:
      1. The output column name is the score column name plus '-int'.
      2. Its thresholds are looked up in threshold_dict and ordered by the numeric
         value of their keys (keys only need to be convertible with int()).
      3. The ordered threshold values are handed to map_scores as bin edges.

    Raises:
        ValueError: If threshold_dict has no entry for one of the score columns.
    """
    for score_col_name in score_col_names:
        thresholds = threshold_dict.get(score_col_name)
        if thresholds is None:
            raise ValueError(f"No thresholds found for score column '{score_col_name}'")

        # Bin edges in ascending percentile order, as a CuPy array.
        ordered_keys = sorted(thresholds.keys(), key=int)
        bin_edges = cp.array([thresholds[key] for key in ordered_keys])

        df = map_scores(df, score_col_name, score_col_name + "-int", bin_edges)
    return df


In [None]:
# Add a "<score>-int" column per classifier by bucketing every partition with the
# thresholds computed in Step 2, then preview the result.
scored_data.df = scored_data.df.map_partitions(map_score_columns, score_col_names, threshold_dict)
scored_data.head()

# Step 4: Compute the Final Ensembled Score

### Purpose:
- To combine the predictions from multiple classifiers into a **single representative score**.
- The ensemble score is computed as the **maximum of all integer scores** across classifiers.

### Approach:
1. **Extract integer scores from each classifier.**
2. **Compute the max integer score for each data point.**
3. **Store the final ensemble score in the dataset.**

In [16]:
# Names of the integer columns created in Step 3 (float column name + "-int").
int_column_names = [f'{v["float_score"]}-int' for v in classifier_scores.values()]
# Row-wise maximum across the classifiers' integer scores.
scored_data.df['ensemble-max-int'] = scored_data.df[int_column_names].max(axis=1)

In [None]:
# Preview the data including the new ensemble column.
scored_data.df.head()

# Step 5: Write Results to Partitioned Buckets


### Purpose:
- Organize and store classified results in a **structured, partitioned format** to facilitate **annealing-based training** for downstream **LLM fine-tuning** and optimization.

In [None]:
# Write the dataset as Parquet, partitioned on the ensemble score column.
scored_data.to_parquet(OUTPUT_BUCKETED_RESULTS, partition_on="ensemble-max-int")

# Verify Results

### Process:
1. **List available partitions** (each corresponds to a score bucket).
2. **Read a sample partition** and validate data integrity.

In [None]:
# List the entries written in Step 5. The sort is lexicographic on the entry names.
# NOTE(review): assumes every entry in the output directory is a bucket — confirm.
all_buckets = sorted(os.listdir(OUTPUT_BUCKETED_RESULTS))
print(all_buckets)
# Read the first listed bucket back as a sanity check.
first_bucket= DocumentDataset.read_parquet(os.path.join(OUTPUT_BUCKETED_RESULTS, all_buckets[0]))
first_bucket.head()