In [None]:
import re
import os
import json
import time
import emoji
import pandas as pd
from tqdm import tqdm
from googleapiclient import discovery

In [None]:
# Where each of the four input CSVs lives (female/male x comments/submissions)
female_comments_path = "/home/haters/Downloads/loaded_data/Combined_data_29Apr/combined_female_comments.csv"
male_comments_path = "/home/haters/Downloads/loaded_data/Combined_data_29Apr/combined_male_comments.csv"
female_submissions_path = "/home/haters/Downloads/loaded_data/Combined_data_29Apr/combined_female_submissions.csv"
male_submissions_path = "/home/haters/Downloads/loaded_data/Combined_data_29Apr/combined_male_submissions.csv"

# Read every CSV into its own DataFrame, in the same order the paths are declared above
female_com_df, male_com_df, female_sub_df, male_sub_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        female_comments_path,
        male_comments_path,
        female_submissions_path,
        male_submissions_path,
    )
]

In [None]:
# Gather the four loaded datasets so the same clean-up can be applied to each of them
dfs = [female_com_df, male_com_df, female_sub_df, male_sub_df]

# Remove the 'temp_id' column in place wherever it is present;
# errors='ignore' leaves frames that do not have the column untouched
for df in dfs:
    df.drop(columns=['temp_id'], inplace=True, errors='ignore')

In [None]:
# Split a DataFrame into n interleaved parts
def split_dataframe(df, n=3):
    """Return ``n`` DataFrames built by taking every n-th row of ``df``.

    Part ``k`` holds the rows at positions k, k + n, k + 2n, ... and gets a
    fresh 0-based index, so the parts differ in length by at most one row.
    """
    parts = []
    for offset in range(n):
        strided_rows = df.iloc[offset::n, :]
        parts.append(strided_rows.reset_index(drop=True))
    return parts

# Two-level split: first into split_n parts, then each of those into subsplit_n smaller parts
def split_and_subsplit_dataframe(df, split_n=2, subsplit_n=3):
    """Split ``df`` twice with ``split_dataframe`` and return the nested result.

    The return value is a list of ``split_n`` lists, each of which holds
    ``subsplit_n`` DataFrames (it is NOT a flat list of DataFrames).
    """
    return [
        split_dataframe(main_part, subsplit_n)
        for main_part in split_dataframe(df, split_n)
    ]

# Submissions and male comments: three interleaved parts each (flat lists of DataFrames)
female_sub_parts, male_com_parts, male_sub_parts = [
    split_dataframe(frame) for frame in (female_sub_df, male_com_df, male_sub_df)
]
# Female comments get the two-level split, so this one is a list of 2 lists
# with 3 DataFrames each rather than a flat list
female_com_parts = split_and_subsplit_dataframe(female_com_df)

In [None]:
# Perspective API keys are secrets, so they are read from the environment instead of being
# stored in the notebook (get your own keys at https://console.developers.google.com/).
# Set PERSPECTIVE_API_KEYS to a comma-separated list, e.g. PERSPECTIVE_API_KEYS="key1,key2,key3".
# NOTE: any key that was ever hard-coded in this cell should be treated as leaked and revoked.
def load_api_keys(env_var="PERSPECTIVE_API_KEYS"):
    """Return the API keys listed in the environment variable ``env_var``.

    Args:
        env_var: Name of the environment variable holding a comma-separated
            list of keys.

    Returns:
        A list of non-empty keys with surrounding whitespace removed; an empty
        list when the variable is unset or contains no keys.
    """
    raw_value = os.environ.get(env_var, "")
    return [key.strip() for key in raw_value.split(",") if key.strip()]


API_KEYS = load_api_keys()
if not API_KEYS:
    # Without keys no API client can be built, so say so here rather than
    # leaving the reader to decode a failure further down the notebook.
    print("WARNING: no Perspective API keys found; set the PERSPECTIVE_API_KEYS "
          "environment variable (comma-separated) and re-run this cell.")

# Build one "commentanalyzer" (Perspective API) client per API key
def build_perspective_client(api_key):
    """Create a v1alpha1 commentanalyzer client authenticated with ``api_key``."""
    return discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=api_key,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )


# Same order as API_KEYS, so clients[i] is authenticated with API_KEYS[i]
clients = [build_perspective_client(key) for key in API_KEYS]

# Basic text preprocessing function
def preprocess_text(text):
    """Normalise raw text before it is sent for scoring.

    Steps, in order: lowercase, remove URLs, replace emojis with their textual
    names, remove remaining punctuation/symbols, collapse whitespace.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents an empty
            CSV cell) are treated as empty text; any other non-string value is
            converted with ``str``.

    Returns:
        The cleaned string, which may be empty.
    """
    # Missing values have no .lower(); map them to empty text instead of crashing
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Emojis must be converted BEFORE the symbol filter below: emoji characters
    # are not matched by \w, so filtering first deleted them and left demojize
    # with nothing to do. Space delimiters keep adjacent emoji names apart;
    # the extra lower() covers emoji names that contain capitals.
    text = emoji.demojize(text, delimiters=(' ', ' ')).lower()  # Convert emojis to text
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

# Function to analyze text using Perspective API with retry handling
def get_toxicity_and_sexually_explicit_scores(client, text, sleep_time=1, retry_limit=3):
    """Score one text for TOXICITY and SEXUALLY_EXPLICIT with the Perspective API.

    Args:
        client: A commentanalyzer client exposing ``comments().analyze(body=...).execute()``.
        text: Raw text to score. Non-string values (e.g. the NaN pandas uses
            for an empty cell) are skipped without calling the API.
        sleep_time: Seconds to wait between failed attempts.
        retry_limit: Maximum number of attempts.

    Returns:
        ``(toxicity_score, sexually_explicit_score)`` taken from each
        attribute's summary score, or ``(None, None)`` when the text is
        missing/empty or no attempt succeeded.
    """
    # Guard before preprocessing: a NaN body would otherwise raise outside the
    # try-block below and abort the whole run.
    if not isinstance(text, str):
        return None, None
    preprocessed_text = preprocess_text(text)
    if not preprocessed_text:
        # Nothing left to score; an empty comment would only burn quota and retries
        return None, None

    analyze_request = {
        'comment': {'text': preprocessed_text},
        'requestedAttributes': {'TOXICITY': {}, 'SEXUALLY_EXPLICIT': {}}
    }
    for attempt in range(1, retry_limit + 1):
        try:
            response = client.comments().analyze(body=analyze_request).execute()
            attribute_scores = response['attributeScores']
            toxicity_score = attribute_scores['TOXICITY']['summaryScore']['value']
            sexually_explicit_score = attribute_scores['SEXUALLY_EXPLICIT']['summaryScore']['value']
            return toxicity_score, sexually_explicit_score
        except Exception as e:
            # googleapiclient's HttpError carries the HTTP response in `.resp`.
            # A 400 means the request itself was rejected, so an identical
            # retry cannot succeed.
            status = getattr(getattr(e, 'resp', None), 'status', None)
            if status == 400:
                print(f"Error analyzing text: {e}. Not retrying (bad request).")
                break
            print(f"Error analyzing text: {e}. Retrying ({attempt}/{retry_limit})...")
            if attempt < retry_limit:
                # No point sleeping after the final attempt
                time.sleep(sleep_time)
    print(f"Skipping text after {retry_limit} retries: {preprocessed_text[:30]}...")  # Log the problematic text
    return None, None

# Function to process DataFrame with multiple API keys
def process_dataframe(df, clients, text_column='body', rows_per_client_per_batch=50, batch_pause=60):
    """Score every row of ``df`` and store the results in two new columns.

    Rows are handed to the clients in rotation (row ``i`` uses
    ``clients[i % len(clients)]``). After every
    ``rows_per_client_per_batch * len(clients)`` rows the loop pauses for
    ``batch_pause`` seconds, except after the final row.

    Args:
        df: DataFrame holding the text to score. It is modified in place.
        clients: Non-empty sequence of Perspective API clients.
        text_column: Name of the column that contains the text.
        rows_per_client_per_batch: Rows each client handles between pauses.
        batch_pause: Seconds to sleep between batches.

    Returns:
        The same ``df`` with ``toxicity_score`` and ``sexually_explicit_score``
        columns added (``None`` where a row could not be scored).

    Raises:
        ValueError: If ``clients`` is empty.
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    if not clients:
        # Fail with a clear message instead of the ZeroDivisionError from `i % 0`
        raise ValueError("process_dataframe needs at least one API client")
    if text_column not in df.columns:
        raise KeyError(f"Column {text_column!r} not found in DataFrame")

    toxicity_scores = []
    sexually_explicit_scores = []
    client_count = len(clients)
    batch_size = rows_per_client_per_batch * client_count
    total_rows = len(df)
    for i, text in enumerate(tqdm(df[text_column], desc="Processing rows")):
        client = clients[i % client_count]  # Rotate clients
        toxicity_score, sexually_explicit_score = get_toxicity_and_sexually_explicit_scores(client, text)
        toxicity_scores.append(toxicity_score)
        sexually_explicit_scores.append(sexually_explicit_score)
        rows_done = i + 1
        # Pause between batches, but not after the last row (nothing follows it)
        if batch_size > 0 and rows_done % batch_size == 0 and rows_done < total_rows:
            time.sleep(batch_pause)
    df['toxicity_score'] = toxicity_scores
    df['sexually_explicit_score'] = sexually_explicit_scores
    return df

# Destination folder for the scored CSV files; create it (and any missing parents) when absent
output_dir = "/home/haters/Downloads/Toxicity_Detection/output_perspective/output_score/"
os.makedirs(name=output_dir, exist_ok=True)

# Yield the DataFrames of a possibly nested list of parts, in order
def _flatten_parts(parts):
    """Walk ``parts`` depth-first, descending into lists/tuples and yielding every other item."""
    for part in parts:
        if isinstance(part, (list, tuple)):
            yield from _flatten_parts(part)
        else:
            yield part


# Process each part and save intermediate results
def process_and_save_parts(parts, filename_prefix, clients):
    """Score every part and write each one to ``<output_dir>/<filename_prefix>_part_<k>.csv``.

    Args:
        parts: List of DataFrames. Nested lists (as returned by
            ``split_and_subsplit_dataframe``) are accepted and flattened in
            order, so they no longer fail when a list reaches ``process_dataframe``.
        filename_prefix: Prefix of the per-part CSV file names.
        clients: API clients forwarded to ``process_dataframe``.

    Returns:
        A flat list of the processed DataFrames, numbered from 1 in the same
        order as their files.
    """
    processed_parts = []
    for part_number, part in enumerate(_flatten_parts(parts), start=1):
        processed_part = process_dataframe(part, clients)
        # os.path.join works whether or not output_dir ends with a separator
        part_path = os.path.join(output_dir, f"{filename_prefix}_part_{part_number}.csv")
        processed_part.to_csv(part_path, index=False)
        processed_parts.append(processed_part)
    return processed_parts

In [None]:
# Score every female-submissions part, then merge the scored parts into a single file
female_sub_preprocessed_parts = process_and_save_parts(female_sub_parts, 'female_submissions_outcome', clients)
final_df = pd.concat(female_sub_preprocessed_parts).reset_index(drop=True)
# Save next to the per-part files. The previous literal path had no leading '/',
# so it was resolved relative to the current working directory instead of output_dir.
final_df.to_csv(os.path.join(output_dir, 'female_submissions_outcome.csv'), index=False)

In [None]:
# Score every male-submissions part, then merge the scored parts into a single file
male_sub_preprocessed_parts = process_and_save_parts(male_sub_parts, 'male_submissions_outcome', clients)
final_df = pd.concat(male_sub_preprocessed_parts).reset_index(drop=True)
# Save next to the per-part files. The previous literal path had no leading '/',
# so it was resolved relative to the current working directory instead of output_dir.
final_df.to_csv(os.path.join(output_dir, 'male_submissions_outcome.csv'), index=False)

In [None]:
# female_com_parts comes from split_and_subsplit_dataframe, i.e. it is a list of lists
# of DataFrames. Flatten it first: passing the nested list on would hand a plain list
# to process_dataframe, and pd.concat cannot combine lists of lists either.
female_com_flat_parts = [
    sub_part
    for group in female_com_parts
    for sub_part in (group if isinstance(group, (list, tuple)) else [group])
]
female_com_preprocessed_parts = process_and_save_parts(female_com_flat_parts, 'female_comments_outcome', clients)
final_df = pd.concat(female_com_preprocessed_parts).reset_index(drop=True)
# Save next to the per-part files. The previous literal path had no leading '/',
# so it was resolved relative to the current working directory instead of output_dir.
final_df.to_csv(os.path.join(output_dir, 'female_comments_outcome.csv'), index=False)