In [None]:
import os
import time
import requests
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
from time import sleep

# Load API key from .env
load_dotenv()
api_key = os.getenv("PERSPECTIVE_API_KEY")
if not api_key:
    raise ValueError("PERSPECTIVE_API_KEY not found in environment")

# Perspective API endpoint
url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={api_key}"
headers = {"Content-Type": "application/json"}

# Configuration
models   = ['Claude', 'Gem_2.5_F', 'Gem_2.5_P', 'GPT_4.1_mini', 'GPT_4o', 'GPT_o3']
datasets = ['RIR', 'CF']  # loop through both
project_root = Path.cwd()
input_dir    = project_root / "Data" / "Intermediate_Data"
output_dir   = project_root / "Data" / "Analysis_data"
os.makedirs(output_dir, exist_ok=True)
results = []

def score_toxicity_with_backoff(text, max_retries=5, base_sleep=3.0):
    for attempt in range(max_retries):
        payload = {
            "comment": {"text": text},
            "languages": ["en"],
            "requestedAttributes": {"TOXICITY": {}}
        }
        resp = requests.post(url, json=payload, headers=headers)
        if resp.status_code == 429:
            wait = base_sleep * (2 ** attempt)
            print(f"429 Too Many Requests, retry {attempt+1} in {wait:.1f}s")
            time.sleep(wait)
            continue
        resp.raise_for_status()
        return resp.json()["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
    # If all retries fail
    print(f"Failed to score toxicity after {max_retries} retries.")
    return float('nan')

# Loop through datasets and models
for ds in datasets:
    for model in models:
        file_path = input_dir / f"Cleaned_{ds}" / f"Cleaned_{ds}_{model}.xlsx"
        if not file_path.exists():
            print(f"Warning: File not found: {file_path}")
            continue
        df = pd.read_excel(file_path)
        df['Response'] = df['Response'].astype(str)

        # Score each response
        for _, row in df.iterrows():
            text = row['Response'].strip()
            if not text:
                continue
            tox = score_toxicity_with_backoff(text)
            results.append({
                "Model": model,
                "Dataset": ds,
                "Religion": row.get('Religion'),
                "Prompt": row.get('Prompt'),
                "Response": text,
                "Toxicity": tox
            })
            # slight pause to respect rate limits
            sleep(0.1)

# Build cumulative DataFrame and save
results_df = pd.DataFrame(results)
output_path = output_dir / "Toxicity_Scores_CF_RIR.xlsx"
results_df.to_excel(output_path, index=False)
print(f"Saved cumulative toxicity scores to: {output_path}")


429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 2 in 2.0s
429 Too Many Requests, retry 3 in 4.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 2 in 2.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 2 in 2.0s
429 Too Many Requests, retry 3 in 4.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 2 in 2.0s
429 Too Many Requests, retry 3 in 4.0s
429 Too Many Requests, retry 4 in 8.0s
429 Too Many Requests, retry 5 in 16.0s
Failed to score toxicity after 5 retries.
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 2 in 2.0s
429 Too Many Requests, retry 3 in 4.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 2 in 2.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests, retry 1 in 1.0s
429 Too Many Requests

In [25]:
import pandas as pd
from pathlib import Path
import os

# Paths
project_root = Path("/mnt/data")  # adjust if your project root differs
input_path = "Data/Analysis_data/Toxicity_Scores_CF_RIR.xlsx"
output_path = "Data/Analysis_data/Toxicity_Averages_by_Model_Religion.xlsx"

# Load the cumulative toxicity data
df = pd.read_excel(input_path)

# Compute average toxicity by Model and Religion
avg_df = (
    df.groupby(['Model', 'Religion'])['Toxicity']
      .mean()
      .reset_index()
)

# Pivot so rows are models, columns are religions
pivot_df = avg_df.pivot(index='Model', columns='Religion', values='Toxicity')

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save the results
pivot_df.to_excel(output_path)


print(f"Saved averaged toxicity by model and religion to: {output_path}")


Saved averaged toxicity by model and religion to: Data/Analysis_data/Toxicity_Averages_by_Model_Religion.xlsx


In [1]:
import pandas as pd
import os
from pathlib import Path

# —— CONFIGURE PATHS —— #
project_root = Path.cwd()  # adjust if not running from project root
input_path   = project_root / "Data" / "Analysis_data" / "Toxicity_Scores_CF_RIR.xlsx"
output_dir   = project_root / "Data" / "Analysis_data"
output_path  = output_dir / "Toxicity_Averages_with_CV.xlsx"
os.makedirs(output_dir, exist_ok=True)

# —— LOAD DATA —— #
df = pd.read_excel(input_path)

# —— COMPUTE MEANS —— #
# 1) Mean toxicity by Model & Religion
mean_df = (
    df.groupby(['Model','Religion'])['Toxicity']
      .mean()
      .reset_index()
)

# 2) Pivot so rows are models, columns are religions
pivot = mean_df.pivot(index='Model', columns='Religion', values='Toxicity')

# —— ADD AGGREGATES —— #
# Average toxicity across all religions
pivot['Average_Toxicity'] = pivot.mean(axis=1)

# Coefficient of variation (std dev / mean)
pivot['CV_Toxicity'] = pivot.std(axis=1, ddof=0) / pivot['Average_Toxicity']

# —— SAVE RESULT —— #
pivot.to_excel(output_path)
print(f"Saved averaged toxicity + CV to {output_path}")


Saved averaged toxicity + CV to d:\Data Science\BUFinal\Data\Analysis_data\Toxicity_Averages_with_CV.xlsx
