### Quick Description

This notebook calculates a toxicity score for each sentence in a subtitle file. Finally, it persists the data in a DataFrame-like output.

In [8]:
%%capture
!pip install --upgrade google-api-python-client

In [2]:
import yaml
import glob
import os
import pandas as pd
import time
import re
import logging
import sys

from tqdm import tqdm
from googleapiclient import discovery

sys.path.append("../utils")
from toxicity_api_communication import get_toxicity_score

In [9]:
# Load the project's path registry, then the Perspective API keys it points to.
# `safe_load` is used for both files: `yaml.load` without an explicit Loader
# raises TypeError on PyYAML >= 6 and can construct arbitrary objects on older
# versions. Context managers make sure the file handles are closed.
with open("../config/filepaths.yaml") as filepaths_file:
    filepaths = yaml.safe_load(filepaths_file)

with open(filepaths["credentials"]) as credentials_file:
    # Mapping whose values are later passed as `developerKey` to the API client.
    credentials = yaml.safe_load(credentials_file)["perspective-api"]

In [10]:
def calculate_toxicity_scores(credentials, input_path, output_path, pause_seconds=1):
    """Score every sentence of every CSV in ``input_path`` and save the results.

    For each file found directly under ``input_path`` the CSV is read, the
    ``text`` column is sent row by row to ``get_toxicity_score`` requesting the
    ``"TOXICITY"`` attribute, the returned values are stored in a new
    ``perspective_score`` column, and the frame is written under the same
    filename into ``output_path``.

    Requests are distributed round-robin over one API client per credential.

    Args:
        credentials: Mapping whose values are Perspective API developer keys
            (the keys of the mapping are ignored). Must not be empty.
        input_path: Directory holding the input CSV files; each must have a
            ``text`` column.
        output_path: Directory the scored CSV files are written to. It is
            created if it does not exist.
        pause_seconds: Seconds to sleep each time the rotation passes the
            first client. Defaults to 1, the previously hard-coded value.

    Raises:
        ValueError: If ``credentials`` is empty, since there would be no
            client to send requests through.
    """
    # Fail fast: without this the function would only break later with a
    # ZeroDivisionError in the round-robin modulo.
    if not credentials:
        raise ValueError("credentials must contain at least one Perspective API key")

    services = [
        discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=api_key,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        )
        for api_key in credentials.values()
    ]
    n_services = len(services)

    # basename is separator-agnostic (unlike splitting on "/"); sorting makes
    # the processing order reproducible since glob order is arbitrary.
    filenames = sorted(
        os.path.basename(path) for path in glob.glob(os.path.join(input_path, "*"))
    )

    # Create the destination up front so a missing directory is not discovered
    # only after a whole file's worth of API calls has been spent.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for filename in tqdm(filenames):
        df_sentences = pd.read_csv(os.path.join(input_path, filename))

        # Use the row position (not the index label) to pick a client, so the
        # rotation works regardless of how the frame happens to be indexed.
        scores = []
        for position, text in enumerate(df_sentences["text"]):
            scores.append(
                get_toxicity_score(services[position % n_services], text, "TOXICITY")
            )

            # Throttle once per full rotation over the available clients.
            if position % n_services == 0:
                time.sleep(pause_seconds)

        df_sentences["perspective_score"] = scores
        df_sentences.to_csv(os.path.join(output_path, filename), index=False)

In [12]:
# Score the sentence files and write them to the scored-output directory,
# both resolved from the filepaths configuration.
calculate_toxicity_scores(
    credentials=credentials,
    input_path=filepaths["coraal_sentences"],
    output_path=filepaths["coraal_scored"],
)

  0%|          | 0/108 [00:00<?, ?it/s]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 02 Jun 2022 18:00:40 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: no",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE",\n        "languageNotSupportedByAttributeError": {\n 