### Quick Description
This notebook uses [Unitary AI toxicity scoring models](https://github.com/unitaryai/detoxify) to score different datasets

In [1]:
import yaml
import glob
import os
import pandas as pd

from tqdm import tqdm
from detoxify import Detoxify

In [2]:
dataset = "buckeye"

filepaths = yaml.load(open("/home/guilherme/Desktop/dissertation/config/filepaths.yaml"), Loader=yaml.FullLoader)

In [3]:
def calculate_toxicity_scores(save_dir):
    filenames = [file.split("/")[-1] for file in glob.glob(os.path.join(save_dir, "*"))]
    
    original_model = Detoxify("original")
    unbiased_model = Detoxify("unbiased")
    multilingual_model = Detoxify("multilingual")

    for filename in tqdm(filenames, total=len(filenames)):
        df_sentences = pd.read_csv(os.path.join(save_dir, filename))
        for i, row in df_sentences.iterrows():
            score = original_model.predict(row.text)
            df_sentences.loc[i, "detoxify_original_score"] = score["toxicity"]
            
            score = unbiased_model.predict(row.text)
            df_sentences.loc[i, "detoxify_unbiased_score"] = score["toxicity"]
            
            score = multilingual_model.predict(row.text)
            df_sentences.loc[i, "detoxify_multilingual_score"] = score["toxicity"]

        df_sentences.to_csv(os.path.join(save_dir, filename), index=False)

In [4]:
calculate_toxicity_scores(filepaths[f"04_{dataset}_scored"])

100%|█████████████████████████████████████████████████████████████████| 39/39 [35:57<00:00, 55.31s/it]
