In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Load cached data from the Detoxify model on the Jigsaw dataset. See https://github.com/unitaryai/detoxify for details.
# The comments are from Wikipedia talk channels, and we are trying to perform outlier detection.
# We will only use the non-toxic data, and then with type-1 error control identify the toxic outliers.
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open; reading the arrays
# inside a `with` block closes the archive as soon as they are in memory.
# (`data` stays bound afterwards, but refers to the closed archive.)
with np.load('data/toxic-text/toxic-text-detoxify.npz') as data:
    preds = data['preds'] # Toxicity score in [0,1]
    toxic = data['labels'] # Toxic (1) or not (0)

In [3]:
# Problem setup
n = 10_000  # Number of non-toxic points set aside for calibration
alpha = 0.1  # Target type-1 error: fraction of non-toxic points we allow to be flagged

In [4]:
# Look at only the non-toxic data
nontoxic = toxic == 0
preds_nontoxic = preds[nontoxic]
preds_toxic = preds[np.invert(nontoxic)]

# Split nontoxic data into calibration and validation sets.
num_nontoxic = preds_nontoxic.shape[0]
if not 0 <= n <= num_nontoxic:
    # Without this check an oversized n surfaces later as an opaque boolean-index length error.
    raise ValueError(f"n={n} calibration points requested, but only {num_nontoxic} non-toxic scores are available.")

# Boolean mask with exactly n True entries, shuffled to pick the calibration points at random.
idx = np.zeros(num_nontoxic, dtype=bool)
idx[:n] = True
# A fixed-seed local generator makes the split (and every number derived from it) reproducible
# under Restart & Run All, without touching NumPy's global random state.
rng = np.random.default_rng(0)
rng.shuffle(idx)
cal_scores, val_scores = preds_nontoxic[idx], preds_nontoxic[np.invert(idx)]

### Conformal outlier detection happens here

In [5]:
# Use the outlier detection method to get a threshold on the toxicities.
# The conformal threshold is the k-th smallest calibration score, k = ceil((n+1)*(1-alpha)).
# We pick that order statistic directly: np.quantile's default linear interpolation at level k/n
# lands slightly below it, which would make the type-1 guarantee only approximate.
k = max(1, int(np.ceil((n+1)*(1-alpha))))
if k > n:
    # Too few calibration points for this alpha (alpha < 1/(n+1)): no finite threshold is valid,
    # so nothing is flagged. The quantile level k/n would also exceed 1 and make np.quantile raise.
    qhat = np.inf
else:
    qhat = np.sort(cal_scores)[k-1]
# Perform outlier detection on the ind and ood data
outlier_ind = val_scores > qhat # We want this to be no more than alpha on average
outlier_ood = preds_toxic > qhat # We want this to be as large as possible, but it doesn't have a guarantee

In [6]:
# Calculate type-1 and type-2 errors
type1 = np.mean(outlier_ind)  # fraction of held-out non-toxic scores above the threshold
type2 = 1 - np.mean(outlier_ood)  # fraction of toxic scores NOT above the threshold
print(f"The type-1 error is {type1:.4f}, the type-2 error is {type2:.4f}, and the threshold is {qhat:.4f}.")

The type-1 error is 0.0994, the type-2 error is 0.2993, and the threshold is 0.4857.


In [7]:
# Show some examples of unflagged and flagged text
content = pd.read_csv('generation-scripts/toxic_text_utils/test.csv')['content']
# Same two steps for each group: print the heading, then five randomly drawn comments.
# Unflagged is sampled first, then flagged, so the random draws happen in the same order.
for heading, mask in (
    ("Unflagged text examples:", preds <= qhat),
    ("\n\nFlagged text examples:", preds > qhat),
):
    print(heading)
    print(list(np.random.choice(content[mask],size=(5,))))

Unflagged text examples:
['E eu não posso xingar uma pessoa que não tem que temer? E essa pessoa não tem o direito a não passar por um calvário que eu lhe mova? Eduardo 話 ', 'La référence à suivre c est le CINB qui ne met pas d espace. Et au passage, en typographie, une espace est un mot féminin. Et encore au passage, il ne faut pas mettre de « & nbsp; » dans wikipédia le logiciel gère tout seul les espaces insécables avant les double ponctuation ou entre les guillemets. TED ', 'Утвержден Постановлением Правительства Российской Федерации от 17 ноября 2010 г. N 928 ПЕРЕЧЕНЬ АВТОМОБИЛЬНЫХ ДОРОГ ОБЩЕГО ПОЛЬЗОВАНИЯ ФЕДЕРАЛЬНОГО ЗНАЧЕНИЯ (в ред. Постановлений Правительства РФ от 18.01.2011 N 10,от 21.02.2011 N 96, от 05.09.2011 N 744, от 29.11.2011 N 990, от 30.12.2011 N 1207) А-121  Сортавала  Санкт-Петербург - Сортавала - автомобильная дорога Р-21  Кола  Идентификационный номер автомобильной дороги 00 ОП ФЗ А-121 автомобильная дорога от Санкт-Петербурга через Приозерск, Сортавалу до Петро