### Quick Description

This notebook calculates the toxicity score from each sentence in a subtitles' file. Finally, it persists the data in a DataFrame like output.

In [1]:
%%capture
!pip install --upgrade google-api-python-client

In [2]:
import yaml
import glob
import os
import pandas as pd
import time
import re
import logging
import sys

from tqdm import tqdm
from googleapiclient import discovery

sys.path.append("../utils")
from toxicity_api_communication import get_toxicity_score

In [3]:
filepaths = yaml.load(open("../config/filepaths.yaml"))
credentials = yaml.load(open("../config/credentials.yaml"))["perspective-api"]

  filepaths = yaml.load(open("../config/filepaths.yaml"))
  credentials = yaml.load(open("../config/credentials.yaml"))["perspective-api"]


In [4]:
def calculate_toxicity_scores(credentials, input_path, output_path):
    services = [
        discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=value,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        ) for _, value in credentials.items()
    ]

    n_services = len(services)
    filenames = [file.split("/")[-1] for file in glob.glob(os.path.join(input_path, "*"))]

    for filename in tqdm(filenames, total=len(filenames)):
        df_sentences = pd.read_csv(os.path.join(input_path, filename))
        for i, row in df_sentences.iterrows():
            df_sentences.loc[i, "perspective_score"] = get_toxicity_score(
                services[i % n_services],
                row.text,
                "TOXICITY"
            )

            if (i % n_services) == 0:
                time.sleep(1)

        df_sentences.to_csv(os.path.join(output_path, filename), index=False)

In [5]:
calculate_toxicity_scores(
    credentials,
    filepaths["model_based_segmentated_data"],
    filepaths["scored_data"]
)

  1%|▍                                                             | 4/634 [05:47<14:36:38, 83.49s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 12:20:44 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ca",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 14:08:52 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ht",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE",\n        "languageNotSupportedByAttributeError": {\n          "detectedLanguages": [\n      

 20%|███████████▉                                               | 128/634 [2:47:11<6:51:12, 48.76s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 15:02:32 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '492', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ja-Latn",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_

 27%|███████████████▌                                         | 173/634 [3:43:45<14:18:18, 111.71s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 15:59:12 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '492', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: el-Latn",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_

 32%|██████████████████▉                                        | 203/634 [4:25:25<6:39:36, 55.63s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 16:41:25 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '492', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ar-Latn",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_

ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 17:41:06 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: co",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE",\n        "languageNotSupportedByAttributeError": {\n          "detectedLanguages": [\n      

 47%|███████████████████████████▌                              | 301/634 [6:38:05<9:25:08, 101.83s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 18:54:16 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: hu",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 60%|███████████████████████████████████▋                       | 383/634 [8:29:01<4:23:24, 62.96s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 20:43:59 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: ro",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 71%|████████████████████████████████████████▏                | 447/634 [9:56:59<10:40:23, 205.48s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 22:12:43 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '484', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: lus",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_A

 79%|█████████████████████████████████████████████▋            | 499/634 [11:12:02<3:23:08, 90.28s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Tue, 24 May 2022 23:27:20 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: rw",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

 88%|██████████████████████████████████████████████████▊       | 555/634 [12:24:06<1:26:42, 65.86s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Wed, 25 May 2022 00:39:42 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '482', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: hr",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_AT

ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Wed, 25 May 2022 01:38:04 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '484', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: und",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE",\n        "languageNotSupportedByAttributeError": {\n          "detectedLanguages": [\n     

 97%|██████████████████████████████████████████████████████████  | 613/634 [13:53:09<25:30, 72.86s/it]ERROR:root:The following error occured: 
({'vary': 'Origin, X-Origin, Referer', 'content-type': 'application/json; charset=UTF-8', 'date': 'Wed, 25 May 2022 02:08:33 GMT', 'server': 'ESF', 'cache-control': 'private', 'x-xss-protection': '0', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'transfer-encoding': 'chunked', 'status': '400', 'content-length': '484', '-content-encoding': 'gzip'}, b'{\n  "error": {\n    "code": 400,\n    "message": "Attribute TOXICITY does not support request languages: und",\n    "status": "INVALID_ARGUMENT",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.commentanalyzer.v1alpha1.Error",\n        "errorType": "LANGUAGE_NOT_SUPPORTED_BY_A