### Quick Description

This notebook calculates a toxicity score for each sentence in every subtitle file of an input directory, using the Perspective API. Finally, it persists the scored sentences as DataFrame-like (CSV) output, one file per input file.

In [None]:
%%capture
!pip install --upgrade google-api-python-client

In [None]:
import yaml
import glob
import os
import pandas as pd
import time
import re
import logging
import sys

from tqdm import tqdm
from googleapiclient import discovery

sys.path.append("../utils")
from toxicity_api_communication import get_toxicity_score

In [None]:
# Load the API keys stored under the "perspective-api" entry of the credentials file.
# yaml.safe_load replaces yaml.load: calling yaml.load without an explicit Loader
# is rejected by PyYAML >= 6 and, on older versions, can construct arbitrary
# Python objects from the file. The with-block guarantees the handle is closed.
with open("../credentials.yaml") as credentials_file:
    credentials = yaml.safe_load(credentials_file)["perspective-api"]

In [None]:
def calculate_toxicity_scores(credentials, input_path, output_path):
    """Score every sentence of every CSV file in ``input_path`` and save the results.

    One ``commentanalyzer`` API client is built per developer key, and the
    sentences of each file are distributed over those clients in round-robin
    order. Each input file is written to ``output_path`` under the same file
    name, with an additional ``score`` column.

    Args:
        credentials: Mapping whose values are API developer keys (the mapping
            keys themselves are ignored).
        input_path: Directory holding the CSV files to score. Every file must
            contain a ``text`` column.
        output_path: Directory that receives the scored CSV files. It is
            created if it does not exist yet.

    Raises:
        ValueError: If ``credentials`` holds no API key.
    """
    # Fail fast: with no client the round-robin below would divide by zero.
    if not credentials:
        raise ValueError("credentials must contain at least one API key")

    services = [
        discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=developer_key,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        )
        for developer_key in credentials.values()
    ]
    n_services = len(services)

    # Create the destination up front so a missing directory does not surface
    # only after all API calls for the first file have already been made.
    os.makedirs(output_path, exist_ok=True)

    # basename() is separator-agnostic (unlike split("/")); sub-directories are
    # skipped because they cannot be read as CSV files.
    filenames = sorted(
        os.path.basename(path)
        for path in glob.glob(os.path.join(input_path, "*"))
        if os.path.isfile(path)
    )

    for filename in tqdm(filenames):
        df_sentences = pd.read_csv(os.path.join(input_path, filename))

        # Rotate over the clients by row position, so the rotation does not
        # depend on the labels of the DataFrame index.
        scores = []
        for position, text in enumerate(df_sentences["text"]):
            scores.append(get_toxicity_score(services[position % n_services], text))

            # Pause one second once per full rotation through the clients
            # (presumably to stay within the per-key request quota).
            if position % n_services == 0:
                time.sleep(1)

        df_sentences["score"] = scores
        df_sentences.to_csv(os.path.join(output_path, filename), index=False)

In [None]:
# Score the segmented sentences and store the scored copies in the output directory.
calculate_toxicity_scores(
    credentials=credentials,
    input_path="../data/02_segmented/fixed_size_context/without_curse_words",
    output_path="../data/03_scored/without_curse_words",
)