In [1]:
from detoxify import Detoxify
from civirank import parsers, analyzers
import json
%load_ext line_profiler
import pandas as pd
from datasets import Dataset
import time
import torch
import pandas as pd
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load sample data
fname = "twitter_test.json"
with open(fname, "r") as fin:
    sample_data_twitter = json.load(fin)
fname = "reddit_test.json"
with open(fname, "r") as fin:
    sample_data_reddit = json.load(fin)
fname = "facebook_test.json"
with open(fname, "r") as fin:
    sample_data_facebook = json.load(fin)

In [4]:
class ToxicityAnalyzer():
    def __init__(self, model_type='original', batch_size=8):
        # Initialize the Detoxify model with the specified model type
        self.detoxify_model = Detoxify(model_type, device='cuda')
        # adds batch size
        self.batch_size = batch_size

    def get_toxicity_scores(self, text):
        """ Analyze the given text and return toxicity scores """
        assert type(text) in [str, pd.core.frame.DataFrame]
        if type(text) == str:
            results = self.detoxify_model.predict(text)
            return results['toxicity']
        else:
            scores = []
            for i in range(0, len(text), self.batch_size):
                batch = text["text"].iloc[i:i+self.batch_size].tolist()
                results = self.detoxify_model.predict(batch)
                scores.extend(results['toxicity'])
            return scores


class ToxicityAnalyzerONNX():
    def __init__(self, model_name="protectai/unbiased-toxic-roberta-onnx", file_name='model.onnx', batch_size=8, max_length=512):
        # Initialize the ONNX model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ORTModelForSequenceClassification.from_pretrained(model_name, file_name=file_name, provider="CUDAExecutionProvider")
        self.classifier = pipeline(
            task="text-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            top_k=None,  # Use top_k=None to get all scores
        )
        # Adds batch size
        self.batch_size = batch_size
        # Set max length for input sequences
        self.max_length = max_length

    def get_toxicity_scores(self, text):
        """ Analyze the given text and return toxicity scores """
        assert type(text) in [str, pd.core.frame.DataFrame]

        def extract_toxicity_score(predictions):
            for pred in predictions:
                if pred['label'] == 'toxicity':
                    return pred['score']
            raise ValueError("Toxicity label not found in predictions")

        def truncate_text(text, max_length):
            # Tokenize and truncate the text to the max length
            tokens = self.tokenizer(text, truncation=True, max_length=max_length, return_tensors='pt')
            return self.tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)

        if type(text) == str:
            truncated_text = truncate_text(text, self.max_length)
            results = self.classifier(truncated_text)
            return extract_toxicity_score(results[0])
        else:
            scores = []
            for i in range(0, len(text), self.batch_size):
                batch = text["text"].iloc[i:i+self.batch_size].tolist()
                truncated_batch = [truncate_text(t, self.max_length) for t in batch]
                results = self.classifier(truncated_batch)
                batch_scores = [extract_toxicity_score(result) for result in results]
                scores.extend(batch_scores)
            return scores

class ToxicityAnalyzerONNX2():
    def __init__(self, model_name="protectai/unbiased-toxic-roberta-onnx", file_name='model.onnx', batch_size=8, max_length=512):
        # Initialize the ONNX model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ORTModelForSequenceClassification.from_pretrained(model_name, file_name=file_name,provider="CUDAExecutionProvider")
        self.classifier = pipeline(
            task="text-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            top_k=None,  # Use top_k=None to get all scores
        )
        # Adds batch size
        self.batch_size = batch_size
        # Set max length for input sequences
        self.max_length = max_length

    def get_toxicity_scores(self, text):
        """ Analyze the given text and return toxicity scores """
        assert type(text) in [str, pd.core.frame.DataFrame]

        def extract_toxicity_score(predictions):
            for pred in predictions:
                if pred['label'].lower() in ['toxic', 'toxicity']:
                    return pred['score']
            return 0.0

        def truncate_text(text, max_length):
            # Tokenize and truncate the text to the max length
            tokens = self.tokenizer(text, truncation=True, max_length=max_length, return_tensors='pt')
            return self.tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)

        if type(text) == str:
            truncated_text = truncate_text(text, self.max_length)
            results = self.classifier(truncated_text)
            return extract_toxicity_score(results[0])
        else:
            # Prepare the dataset
            text_list = text["text"].tolist()
            truncated_texts = [truncate_text(t, self.max_length) for t in text_list]
            dataset = Dataset.from_dict({"text": truncated_texts})
            
            # Process the dataset in batches
            scores = []
            for i in range(0, len(dataset), self.batch_size):
                batch = dataset.select(range(i, min(i + self.batch_size, len(dataset))))
                results = self.classifier(batch["text"])
                batch_scores = [extract_toxicity_score(result) for result in results]
                scores.extend(batch_scores)
            return scores

class ToxicityAnalyzerONNX3():
    def __init__(self, model_name="protectai/unbiased-toxic-roberta-onnx", batch_size=8, file_name='model.onnx', gpu_id=0):
        # Initialize the ONNX model and tokenizer with the specified model name
        self.model = ORTModelForSequenceClassification.from_pretrained(model_name, file_name=file_name, provider="CUDAExecutionProvider", provider_options={'device_id': gpu_id})
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.batch_size = batch_size

        # Find the index of the 'toxicity' label
        self.toxicity_index = None
        for idx, label in self.model.config.id2label.items():
            if label.lower() == 'toxicity':
                self.toxicity_index = idx
                break
        if self.toxicity_index is None:
            raise ValueError("Toxicity label not found in model's id2label mapping.")

    def classify_texts(self, texts):
        """ Tokenize and classify a batch of texts """
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = self.model(**inputs)

        probabilities = torch.sigmoid(outputs.logits)

        batch_results = []
        for prob in probabilities:
            result = prob[self.toxicity_index].item()  # Get the probability for the 'toxicity' label
            batch_results.append(result)

        return batch_results

    def get_toxicity_scores(self, text):
        """ Analyze the given text or DataFrame and return toxicity scores """
        assert isinstance(text, (str, pd.DataFrame)), "Input should be either a string or a DataFrame"

        if isinstance(text, str):
            results = self.classify_texts([text])
            return results[0]  # Return the score for the single input string
        else:
            results = []
            for start in range(0, len(text), self.batch_size):
                end = start + self.batch_size
                batch_texts = text["text"].iloc[start:end].tolist()
                batch_results = self.classify_texts(batch_texts)
                results.extend(batch_results)

            results_df = pd.DataFrame(results, index=text.index[:len(results)], columns=["toxicity"])
            return results_df

class ToxicityAnalyzerONNX3Q():
    def __init__(self, model_name="protectai/unbiased-toxic-roberta-onnx", batch_size=8, file_name='model_quantized.onnx', gpu_id=0):
        # Initialize the ONNX model and tokenizer with the specified model name
        self.model = ORTModelForSequenceClassification.from_pretrained(model_name, file_name=file_name, provider="CPUExecutionProvider")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.batch_size = batch_size

        # Find the index of the 'toxicity' label
        self.toxicity_index = None
        for idx, label in self.model.config.id2label.items():
            if label.lower() == 'toxicity':
                self.toxicity_index = idx
                break
        if self.toxicity_index is None:
            raise ValueError("Toxicity label not found in model's id2label mapping.")

    def classify_texts(self, texts):
        """ Tokenize and classify a batch of texts """
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = self.model(**inputs)

        probabilities = torch.sigmoid(outputs.logits)

        batch_results = []
        for prob in probabilities:
            result = prob[self.toxicity_index].item()  # Get the probability for the 'toxicity' label
            batch_results.append(result)

        return batch_results

    def get_toxicity_scores(self, text):
        """ Analyze the given text or DataFrame and return toxicity scores """
        assert isinstance(text, (str, pd.DataFrame)), "Input should be either a string or a DataFrame"

        if isinstance(text, str):
            results = self.classify_texts([text])
            return results[0]  # Return the score for the single input string
        else:
            results = []
            for start in range(0, len(text), self.batch_size):
                end = start + self.batch_size
                batch_texts = text["text"].iloc[start:end].tolist()
                batch_results = self.classify_texts(batch_texts)
                results.extend(batch_results)

            results_df = pd.DataFrame(results, index=text.index[:len(results)], columns=["toxicity"])
            return results_df

class ToxicityAnalyzerONNX3QRT():
    def __init__(self, model_name="protectai/unbiased-toxic-roberta-onnx", batch_size=8, file_name='model_quantized.onnx', gpu_id=0):
        # Initialize the ONNX model and tokenizer with the specified model name
        self.model = ORTModelForSequenceClassification.from_pretrained(model_name, file_name=file_name, provider="TensorrtExecutionProvider")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.batch_size = batch_size

        # Find the index of the 'toxicity' label
        self.toxicity_index = None
        for idx, label in self.model.config.id2label.items():
            if label.lower() == 'toxicity':
                self.toxicity_index = idx
                break
        if self.toxicity_index is None:
            raise ValueError("Toxicity label not found in model's id2label mapping.")

    def classify_texts(self, texts):
        """ Tokenize and classify a batch of texts """
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = self.model(**inputs)

        probabilities = torch.sigmoid(outputs.logits)

        batch_results = []
        for prob in probabilities:
            result = prob[self.toxicity_index].item()  # Get the probability for the 'toxicity' label
            batch_results.append(result)

        return batch_results

    def get_toxicity_scores(self, text):
        """ Analyze the given text or DataFrame and return toxicity scores """
        assert isinstance(text, (str, pd.DataFrame)), "Input should be either a string or a DataFrame"

        if isinstance(text, str):
            results = self.classify_texts([text])
            return results[0]  # Return the score for the single input string
        else:
            results = []
            for start in range(0, len(text), self.batch_size):
                end = start + self.batch_size
                batch_texts = text["text"].iloc[start:end].tolist()
                batch_results = self.classify_texts(batch_texts)
                results.extend(batch_results)

            results_df = pd.DataFrame(results, index=text.index[:len(results)], columns=["toxicity"])
            return results_df

def parse_data(dataset):

    LanguageAnalyzer = analyzers.LanguageAnalyzer()

    platform = dataset["session"]["platform"]

    for i in range(len(dataset["items"])):
        dataset['items'][i]['lang'] = LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))

    if platform == "twitter":
        posts = parsers.parse_twitter_posts(dataset["items"])
    elif platform == "reddit":
        posts = parsers.parse_reddit_posts(dataset["items"])
    elif platform == "facebook":
        posts = parsers.parse_facebook_posts(dataset["items"])

    return posts

In [32]:
analyzer = ToxicityAnalyzerONNX3()
text = 'shut up, you idiot!'
print(analyzer.get_toxicity_scores(text))



0.9974874258041382


[0;93m2024-06-15 04:25:10.311073860 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 04:25:10.311100490 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [38]:
v = ToxicityAnalyzerONNX3()
data = parse_data(sample_data_reddit)
tic = time.time()
v.get_toxicity_scores(data)
toc = time.time()
print("Time taken: ", toc-tic)

[0;93m2024-06-15 04:27:11.895393344 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 04:27:11.895421287 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Time taken:  0.4575917720794678


In [35]:
v = ToxicityAnalyzerONNX3Q()
data = parse_data(sample_data_reddit)
tic = time.time()
v.get_toxicity_scores(data)
toc = time.time()
print("Time taken: ", toc-tic)



Time taken:  3.2566821575164795


In [37]:
v = ToxicityAnalyzerONNX3Q(file_name='model.onnx')
data = parse_data(sample_data_reddit)
tic = time.time()
v.get_toxicity_scores(data)
toc = time.time()
print("Time taken: ", toc-tic)



Time taken:  7.788755893707275


In [6]:
v = ToxicityAnalyzerONNX3QRT(file_name='model_quantized.onnx')
data = parse_data(sample_data_reddit)
tic = time.time()
v.get_toxicity_scores(data)
toc = time.time()
print("Time taken: ", toc-tic)



*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/python/onnxruntime_pybind_state.cc:456 void onnxruntime::python::RegisterTensorRTPluginsAsCustomOps(onnxruntime::python::PySessionOptions&, const ProviderOptions&) Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************


[1;31m2024-06-15 04:29:12.068221320 [E:onnxruntime:Default, provider_bridge_ort.cc:1730 TryGetProviderInfo_TensorRT] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1426 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_tensorrt.so with error: libnvinfer.so.10: cannot open shared object file: No such file or directory
[m
[0;93m2024-06-15 04:29:12.195785637 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 294 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m


Time taken:  4.874242305755615


In [32]:
analyzer = ToxicityAnalyzer()
text = 'shut up, you idiot!'
print(analyzer.get_toxicity_scores(text))



0.99506074


In [33]:
v = ToxicityAnalyzer()
data = parse_data(sample_data_reddit)
%lprun -u 1 -f v.get_toxicity_scores v.get_toxicity_scores(data)

Timer unit: 1 s

Total time: 0.755877 s
File: /tmp/ipykernel_454289/4292279429.py
Function: get_toxicity_scores at line 8

Line #      Hits         Time  Per Hit   % Time  Line Contents
     8                                               def get_toxicity_scores(self, text):
     9                                                   """ Analyze the given text and return toxicity scores """
    10         1          0.0      0.0      0.0          assert type(text) in [str, pd.core.frame.DataFrame]
    11         1          0.0      0.0      0.0          if type(text) == str:
    12                                                       results = self.detoxify_model.predict(text)
    13                                                       return results['toxicity']
    14                                                   else:
    15         1          0.0      0.0      0.0              scores = []
    16        21          0.0      0.0      0.0              for i in range(0, len(text), se

In [6]:
analyzer = ToxicityAnalyzer()
data = parse_data(sample_data_reddit)
tic = time.time()
analyzer.get_toxicity_scores(data)
toc = time.time()
print(toc - tic)


0.6244995594024658


In [42]:
v = ToxicityAnalyzerONNX2(file_name='model_quantized.onnx')
data = parse_data(sample_data_reddit)
%lprun -u 1 -f v.get_toxicity_scores v.get_toxicity_scores(data)

[0;93m2024-06-15 03:38:35.758374975 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 294 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2024-06-15 03:38:35.761604234 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 03:38:35.761610035 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Timer unit: 1 s

Total time: 5.84949 s
File: /tmp/ipykernel_454289/4292279429.py
Function: get_toxicity_scores at line 84

Line #      Hits         Time  Per Hit   % Time  Line Contents
    84                                               def get_toxicity_scores(self, text):
    85                                                   """ Analyze the given text and return toxicity scores """
    86         1          0.0      0.0      0.0          assert type(text) in [str, pd.core.frame.DataFrame]
    87                                           
    88         1          0.0      0.0      0.0          def extract_toxicity_score(predictions):
    89                                                       for pred in predictions:
    90                                                           if pred['label'].lower() in ['toxic', 'toxicity']:
    91                                                               return pred['score']
    92                                                      

In [5]:
analyzer = ToxicityAnalyzerONNX()
text = 'shut up, you idiot!'
print(analyzer.get_toxicity_scores(text))



0.9974874258041382


[0;93m2024-06-15 03:00:26.144434704 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 03:00:26.144460263 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [15]:
v = ToxicityAnalyzerONNX()
data = parse_data(sample_data_reddit)
%lprun -u 1 -f v.get_toxicity_scores v.get_toxicity_scores(data)

[0;93m2024-06-15 03:05:29.174982030 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 03:05:29.175009432 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Timer unit: 1 s

Total time: 1.20212 s
File: /tmp/ipykernel_454289/1449302690.py
Function: get_toxicity_scores at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                               def get_toxicity_scores(self, text):
    40                                                   """ Analyze the given text and return toxicity scores """
    41         1          0.0      0.0      0.0          assert type(text) in [str, pd.core.frame.DataFrame]
    42                                           
    43         1          0.0      0.0      0.0          def extract_toxicity_score(predictions):
    44                                                       for pred in predictions:
    45                                                           if pred['label'] == 'toxicity':
    46                                                               return pred['score']
    47                                                       raise ValueError("

In [16]:
v = ToxicityAnalyzerONNX()
data = parse_data(sample_data_reddit)
tic = time.time()
v.get_toxicity_scores(data)
toc = time.time()
print(toc - tic)


[0;93m2024-06-15 03:05:33.748742712 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 03:05:33.748769082 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


1.0257039070129395


In [29]:
v = ToxicityAnalyzerONNX2()
data = parse_data(sample_data_reddit)
%lprun -u 1 -f v.get_toxicity_scores v.get_toxicity_scores(data)

[0;93m2024-06-15 03:09:44.733053865 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 03:09:44.733084253 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Timer unit: 1 s

Total time: 1.19457 s
File: /tmp/ipykernel_454289/4292279429.py
Function: get_toxicity_scores at line 84

Line #      Hits         Time  Per Hit   % Time  Line Contents
    84                                               def get_toxicity_scores(self, text):
    85                                                   """ Analyze the given text and return toxicity scores """
    86         1          0.0      0.0      0.0          assert type(text) in [str, pd.core.frame.DataFrame]
    87                                           
    88         1          0.0      0.0      0.0          def extract_toxicity_score(predictions):
    89                                                       for pred in predictions:
    90                                                           if pred['label'].lower() in ['toxic', 'toxicity']:
    91                                                               return pred['score']
    92                                                      

In [28]:
v = ToxicityAnalyzerONNX2()
data = parse_data(sample_data_reddit)
tic = time.time()
v.get_toxicity_scores(data)
toc = time.time()
print(toc - tic)


[0;93m2024-06-15 03:09:39.196171842 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 03:09:39.196220154 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


1.137446403503418
