In [None]:
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# under /content/drive/... can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Source workbook containing the tweets to score.
input_file = "/content/drive/MyDrive/toxic/bolsonaro_filtered.xlsx"
# Destination workbook for the compact per-tweet result export.
output_file = "/content/drive/MyDrive/toxic/bolsonaro_perspective.xlsx"

In [None]:
# Perspective API credentials. The key is read from the PERSPECTIVE_API_KEY
# environment variable so it never has to be pasted into (and saved with) the
# notebook; when the variable is unset this falls back to an empty key.
API_KEY = os.environ.get("PERSPECTIVE_API_KEY", "")
# The analyze endpoint takes the key as a query-string parameter.
URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"

In [None]:
# One shared HTTP session for every API call, so all requests go through the
# same adapter and therefore the same retry policy.
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    # urllib3 only retries idempotent verbs by default, which excludes POST.
    # The analyze endpoint is called with POST, so without this the
    # status_forcelist above would never trigger a retry.
    # (`allowed_methods` requires urllib3 >= 1.26.)
    allowed_methods=Retry.DEFAULT_ALLOWED_METHODS | {"POST"},
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

In [None]:
# Client-side throttle: at most RATE_LIMIT calls per PERIOD-second window.
# NOTE(review): 600 per 60 s presumably mirrors this project's API quota — confirm.
RATE_LIMIT = 600
PERIOD = 60
# Shared throttle state used by rate_limit(); `lock` guards `calls` and
# `last_reset` when several worker threads call rate_limit() concurrently.
lock = threading.Lock()
# Number of calls counted in the current window.
calls = 0
# Wall-clock time (time.time()) at which the current window started.
last_reset = time.time()

In [None]:
def rate_limit():
    """Block until the caller may issue one more request, then count it.

    Implements a fixed-window limiter over the module-level ``calls`` /
    ``last_reset`` state: at most ``RATE_LIMIT`` calls are admitted per
    ``PERIOD`` seconds. The whole check runs under ``lock``; when the window
    is full the sleep happens while the lock is held, so other threads wait
    behind it until the next window opens.
    """
    global calls, last_reset
    with lock:
        current = time.time()

        # The previous window has fully elapsed: start a fresh one.
        window_expired = current - last_reset > PERIOD
        if window_expired:
            last_reset, calls = current, 0

        # Window budget exhausted: wait out the remainder, then open a new window.
        if calls >= RATE_LIMIT:
            remaining = PERIOD - (current - last_reset)
            if remaining > 0:
                time.sleep(remaining)
            last_reset, calls = time.time(), 0

        calls += 1

In [None]:
def analyze_tweet_limited(tweet: str) -> dict:
    """Score one tweet with the Perspective API, honouring the rate limit.

    Args:
        tweet: Text to analyse. Non-string values (e.g. NaN coming from an
            empty spreadsheet cell) and blank strings are rejected locally,
            without waiting on the rate limiter or spending an API call.

    Returns:
        On success, a dict mapping each attribute name in the response to its
        summary score value. If the input is rejected, or the request or the
        response parsing fails, ``{"error": <message>}`` is returned instead
        of raising, so one bad tweet does not abort a whole batch.
    """
    # Nothing meaningful to score; skip the network round-trip entirely.
    if not isinstance(tweet, str) or not tweet.strip():
        return {"error": "empty or non-text comment"}

    rate_limit()
    payload = {
        "comment": {"text": tweet},
        "languages": ["pt"],
        "requestedAttributes": {
            "TOXICITY": {},
            "SEVERE_TOXICITY": {},
            "IDENTITY_ATTACK": {},
            "INSULT": {},
            "PROFANITY": {},
            "THREAT": {},
        },
    }
    try:
        resp = session.post(URL, json=payload, timeout=10)
        resp.raise_for_status()
        scores = resp.json()["attributeScores"]
        return {k: v["summaryScore"]["value"] for k, v in scores.items()}
    except Exception as e:
        # requests puts the request URL — including the ?key=... query string —
        # into its error messages. The message is returned to the caller as
        # data, so scrub the key before it can be stored or exported.
        message = str(e)
        if API_KEY:  # guard: replacing "" would mangle the whole message
            message = message.replace(API_KEY, "[REDACTED]")
        return {"error": message}

In [None]:
def process_tweets(tweets, max_workers=10):
    """Score every tweet concurrently and return the results as a list.

    Args:
        tweets: Sized sequence of tweet texts (``len()`` is used for the
            progress-bar total).
        max_workers: Number of worker threads in the pool.

    Returns:
        A list with one ``analyze_tweet_limited`` result per tweet, in the
        same order as ``tweets``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order; tqdm only wraps the
        # iterator to display progress while the list is being built.
        results = executor.map(analyze_tweet_limited, tweets)
        progress = tqdm(results, total=len(tweets), desc="Analyzing tweets")
        return list(progress)

In [None]:
# Load the tweets to score from the input workbook into a DataFrame.
df = pd.read_excel(input_file)

In [None]:
# Score the pre-processed tweet text with 10 worker threads. `scores_list`
# holds one result per entry of `tweets`.
tweets = df["text_processed"].tolist()
scores_list = process_tweets(tweets, max_workers=10)

In [None]:
# Turn the list of per-tweet result dicts into a table: one row per tweet,
# one column per dict key.
scores_df = pd.json_normalize(scores_list)

In [None]:
# Put the identifying columns and the scores side by side. The index of the
# tweet columns is reset to 0..n-1 so rows line up positionally with
# `scores_df` when concatenating along the column axis.
df_all_attr = pd.concat(
    [df[["id", "text", "text_processed"]].reset_index(drop=True), scores_df],
    axis=1,
)

In [None]:
# Flag tweets whose TOXICITY score reaches the 0.7 threshold.
# Rows without a score (NaN TOXICITY) must not be labelled non-toxic:
# `NaN >= 0.7` evaluates to False, which would silently report them as clean.
# A nullable boolean keeps them as missing (<NA>) instead.
df_all_attr["is_toxic"] = (
    (df_all_attr["TOXICITY"] >= 0.7)
    .astype("boolean")
    .mask(df_all_attr["TOXICITY"].isna())
)

In [None]:
# Convert ids to strings before exporting.
# NOTE(review): presumably so long numeric ids are not rounded by Excel — confirm.
df_all_attr['id'] = df_all_attr['id'].astype(str)
# Compact export: id, original text and the toxicity flag only.
df_all_attr[["id", "text", "is_toxic"]].to_excel(output_file, index=False)
# Full export: every column of df_all_attr.
df_all_attr.to_excel("/content/drive/MyDrive/toxic/bolsonaro_attributes.xlsx", index=False)