### Quick Description

This notebook computes a toxicity score for every sentence in each segmented subtitle file using the Perspective API. It then saves the scored sentences as one CSV file per input file.

In [None]:
%%capture
!pip install --upgrade google-api-python-client

In [None]:
import yaml
import glob
import os
import pandas as pd
import time
import re
import logging

from tqdm import tqdm
from googleapiclient import discovery

In [None]:
# Load the Perspective API keys from the shared credentials file.
# safe_load is used because yaml.load() without an explicit Loader raises a
# TypeError on PyYAML >= 6 and can build arbitrary Python objects on older
# releases. The context manager closes the file handle instead of leaking it.
with open("../credentials.yaml") as credentials_file:
    credentials = yaml.safe_load(credentials_file)["perspective-api"]

In [None]:
def get_toxicity_score(service, text, max_length=3000):
    """Return the TOXICITY summary score reported by ``service`` for ``text``.

    Parameters
    ----------
    service
        Client object exposing ``comments().analyze(body=...).execute()``.
    text
        Sentence to score. Non-string values (for example the ``NaN`` that
        pandas produces for an empty CSV cell) are treated as "nothing to
        score" rather than as an error.
    max_length : int, optional
        Exclusive upper bound on ``len(text)``; texts of this length or
        longer are skipped without calling the service. Defaults to 3000.

    Returns
    -------
    The value found at ``attributeScores -> TOXICITY -> summaryScore -> value``
    in the response, or ``None`` when the text is skipped or when the request
    or the response parsing fails.
    """
    # Missing values are not failures: skip them quietly instead of letting
    # len() raise a TypeError that would be logged as if the API had failed.
    if not isinstance(text, str):
        return None

    # Empty and over-long texts are never sent to the service.
    if not 0 < len(text) < max_length:
        return None

    analyze_request = {
        'comment': {'text': text},
        'requestedAttributes': {'TOXICITY': {}}
    }

    try:
        response = service.comments().analyze(body=analyze_request).execute()
        # A response lacking any of these keys makes .get() return None and
        # the next .get() raise AttributeError, which is handled below.
        return (
            response.get("attributeScores")
            .get("TOXICITY")
            .get("summaryScore")
            .get("value")
        )
    except Exception as e:
        # Deliberately best-effort: a single failed sentence is logged and
        # scored as None so that the surrounding batch keeps running.
        logging.error(f"The following error occured: {e.args}")
        return None

In [None]:
# Build one Perspective API ("commentanalyzer") client per configured API key,
# in the order key-1, key-2, key-3.
services = [
    discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=credentials[key_name],
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )
    for key_name in ("key-1", "key-2", "key-3")
]

# Number of clients available for rotation.
n_services = len(services)

In [None]:
input_path = "../data/02_segmented/fixed_size_context"
output_path = "../data/03_scored"

# Create the output directory before any request is made; otherwise a missing
# directory is only discovered at to_csv(), after a whole file has been scored.
os.makedirs(output_path, exist_ok=True)

# Keep regular files only (a sub-directory would make read_csv fail) and use
# os.path.basename so the name extraction does not depend on "/" separators.
# Sorting makes the processing order deterministic across runs.
filenames = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(input_path, "*"))
    if os.path.isfile(path)
)

for filename in tqdm(filenames, total=len(filenames)):
    df_sentences = pd.read_csv(os.path.join(input_path, filename))
    for i, row in df_sentences.iterrows():
        # Rotate through the API clients by row index so consecutive
        # sentences are sent with different keys.
        df_sentences.loc[i, "score"] = get_toxicity_score(services[i % n_services], row.text)

        # Pause for one second each time the rotation is at the first client
        # (presumably to respect a per-key rate limit; confirm against quota).
        if (i % n_services) == 0:
            time.sleep(1)

    # Persist the scored sentences under the same file name as the input.
    df_sentences.to_csv(os.path.join(output_path, filename), index=False)