### Quick Description

This notebook calculates the toxicity score from each sentence in a subtitles' file. Finally, it persists the data in a DataFrame like output.

In [1]:
%%capture
!pip install --upgrade google-api-python-client

In [2]:
import yaml
import glob
import os
import pandas as pd
import time
import re
import logging
import sys

from tqdm import tqdm
from googleapiclient import discovery

sys.path.append("../utils")
from toxicity_api_communication import get_toxicity_score

In [3]:
filepaths = yaml.load(open("../config/filepaths.yaml"))
credentials = yaml.load(open("../config/credentials.yaml"))["perspective-api"]

  filepaths = yaml.load(open("../config/filepaths.yaml"))
  credentials = yaml.load(open("../config/credentials.yaml"))["perspective-api"]


In [4]:
def calculate_toxicity_scores(credentials, input_path, output_path):
    services = [
        discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=value,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        ) for _, value in credentials.items()
    ]

    n_services = len(services)
    filenames = [file.split("/")[-1] for file in glob.glob(os.path.join(input_path, "*"))]

    for filename in tqdm(filenames, total=len(filenames)):
        df_sentences = pd.read_csv(os.path.join(input_path, filename))
        for i, row in df_sentences.iterrows():
            df_sentences.loc[i, "score"] = get_toxicity_score(
                services[i % n_services],
                row.text,
                "TOXICITY"
            )

            if (i % n_services) == 0:
                time.sleep(1)

        df_sentences.to_csv(os.path.join(output_path, filename), index=False)

In [5]:
calculate_toxicity_scores(
    credentials,
    filepaths["model_based_segmentated_data"],
    filepaths["perspective_scored_data"]
)

  5%|██▊                                                           | 25/552 [23:42<6:56:52, 47.46s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 04:16:57 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '484', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: haw",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_A

 12%|███████▏                                                      | 64/552 [54:38<4:11:28, 30.92s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 04:47:42 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '492', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ja-Latn",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_

 14%|████████▌                                                   | 79/552 [1:07:29<7:26:36, 56.65s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 05:00:20 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: da",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 28%|████████████████▊                                          | 157/552 [2:10:32<6:28:02, 58.94s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 06:04:37 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: hr",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 43%|█████████████████████████▏                                 | 236/552 [3:27:45<4:03:47, 46.29s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 07:21:18 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ms",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 51%|██████████████████████████████                             | 281/552 [4:11:48<4:53:30, 64.98s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 08:05:06 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: no",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 62%|████████████████████████████████████▍                      | 341/552 [5:12:16<5:33:29, 94.83s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 09:05:52 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: vi",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 72%|██████████████████████████████████████████▋                | 399/552 [6:05:49<1:38:01, 38.44s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 09:59:19 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: az",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 79%|██████████████████████████████████████████████▊            | 438/552 [6:42:40<2:09:32, 68.18s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 10:36:42 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '484', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: und",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_A

ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 11:15:42 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '492', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ja-Latn",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE",\n        "languageNotSupportedByAttributeError": {\n          "detectedLanguages": [\n 

ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Thu, 17 Mar 2022 12:34:29 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ha",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE",\n        "languageNotSupportedByAttributeError": {\n          "detectedLanguages": [\n      