### Quick Description

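This notebook calculates a toxicity score for each sentence in a subtitles file, using Flair's `en-sentiment` classifier (the score is the classifier's confidence, negated when the predicted label is `NEGATIVE`). Finally, it persists the data in a DataFrame-like output: each CSV file is rewritten with an added `flair_score` column.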

In [1]:
import yaml
import glob
import os
import pandas as pd
import sys

from tqdm import tqdm
from flair.models import TextClassifier
from flair.data import Sentence

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

2023-05-29 10:44:47.958911: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-29 10:44:48.008046: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-29 10:44:48.008059: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Corpus processed by this notebook; interpolated into the path key below.
dataset = "buckeye"

# Directory with the corpus' sentence files, resolved by the project helper
# (presumably a key lookup in filepaths.yaml -- confirm in AbsolutePathBuilder).
input_path = AbsolutePathBuilder.get_path(
    f"04_{dataset}_scored", filepaths="../../../config/filepaths.yaml"
)

In [3]:
def _signed_sentiment_score(sentence):
    """Return the signed confidence of a predicted sentence, or NaN if unlabeled.

    The confidence of the first label is negated when that label is
    "NEGATIVE" and returned unchanged otherwise. ``None`` (no text) and
    sentences that received no label map to NaN.
    """
    if sentence is None or not sentence.labels:
        return float("nan")
    label = sentence.labels[0]
    # Use the public `value` / `score` accessors instead of the private
    # `_value` / `_score` attributes.
    return -label.score if label.value == "NEGATIVE" else label.score


def calculate_toxicity_scores(save_dir):
    """Add a ``flair_score`` column to every CSV file found in ``save_dir``.

    Each file is read with pandas, every value of its ``text`` column is
    classified with Flair's ``en-sentiment`` model, and the file is
    overwritten in place with the extra ``flair_score`` column. Despite the
    function name, the stored value is a signed sentiment confidence:
    negative when the predicted label is "NEGATIVE", positive otherwise.

    Parameters
    ----------
    save_dir : str
        Directory whose regular files are read as CSV and rewritten.
        Sub-directories are skipped.

    Notes
    -----
    * Rows with a missing ``text`` value, or whose text yields no label
      (e.g. an empty string), get NaN instead of raising.
    * All sentences of a file are predicted in a single ``predict`` call so
      the classifier can batch them; scores may differ from one-at-a-time
      prediction by floating-point noise.
    """
    classifier = TextClassifier.load('en-sentiment')

    # Work with the paths returned by glob directly; splitting on "/" to get
    # a file name is not portable. Directories cannot be read as CSV.
    filepaths = sorted(
        path
        for path in glob.glob(os.path.join(save_dir, "*"))
        if os.path.isfile(path)
    )

    for filepath in tqdm(filepaths, total=len(filepaths)):
        df_sentences = pd.read_csv(filepath)

        # Keep one slot per row so scores stay aligned with the DataFrame;
        # rows without text get no Sentence and therefore a NaN score.
        sentences = [
            None if pd.isna(text) else Sentence(str(text))
            for text in df_sentences["text"]
        ]

        to_predict = [sentence for sentence in sentences if sentence is not None]
        if to_predict:
            # predict() annotates the Sentence objects in place.
            classifier.predict(to_predict)

        # Assign the whole column at once instead of cell-by-cell .loc writes.
        df_sentences["flair_score"] = [
            _signed_sentiment_score(sentence) for sentence in sentences
        ]

        df_sentences.to_csv(filepath, index=False)

In [4]:
# Score every file under input_path; each file is overwritten in place with
# an added "flair_score" column.
calculate_toxicity_scores(input_path)

2022-10-14 07:01:34,367 loading file /home/guilherme/.flair/models/sentiment-en-mix-distillbert_4.pt


100%|█████████████████████████████████████████████████████████████████| 39/39 [05:28<00:00,  8.41s/it]


In [3]:
# Load the Flair sentiment model at module level for the ad-hoc checks below.
classifier = TextClassifier.load('en-sentiment')

# Hand-written pairs: a non-standard spelling next to a standard-English
# rendering, plus a "black" / "white" pair, to compare the scores the model
# assigns to each side of a pair.
sentences = [
    "all my n*ggas on the porch and neva ina house",
    "all my friends on the porch and never in the house",
    "you're black",
    "you're white",
    "cant fuhgit you",
    "I can't forget you"
]

# Map each raw sentence to its signed confidence: negative when the model
# predicts "NEGATIVE", positive otherwise.
scored = {}
for text in sentences:
    sentence = Sentence(text)
    classifier.predict(sentence)

    # Read the prediction through the public `value` / `score` accessors
    # rather than the private `_value` / `_score` attributes.
    label = sentence.labels[0]
    scored[text] = -label.score if label.value == "NEGATIVE" else label.score

2023-05-29 10:46:03,893 loading file /home/guilherme/.flair/models/sentiment-en-mix-distillbert_4.pt


In [4]:
# Show the sentence -> signed score mapping (negative means a NEGATIVE label).
scored

{'all my n*ggas on the porch and neva ina house': -0.7296380400657654,
 'all my friends on the porch and never in the house': 0.9830883145332336,
 "you're black": 0.9994387030601501,
 "you're white": 0.9992408752441406,
 'cant fuhgit you': 0.9111694693565369,
 "I can't forget you": 0.995732843875885}