In [None]:
# Settings
# These are left as None placeholders here and must be filled in before the cells below are run.

# Google Perspective API key; assigned to API_KEY and sent as the `key` query parameter
PERSPECTIVE_API = None
# Value handed to asyncio.Semaphore below, i.e. the maximum number of API requests in flight at once
PERSPECTIVE_QUOTA = None

# Location of the yearly files containing sentences flagged as hateful by the FB Roberta HS model
TARGET_FOLDER = None

In [14]:
import json
import aiohttp
import asyncio
import math
import os
import pandas as pd  
import nest_asyncio  # Required for Jupyter Notebooks to allow nested asyncio loops
from aiohttp import ClientSession

# Apply nested asyncio to allow event loop usage in Jupyter
nest_asyncio.apply()

# Define the Google Perspective API URL and your API key
API_URL = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze"
API_KEY = PERSPECTIVE_API

# Semaphore shared by all request coroutines. It bounds how many requests run
# concurrently (PERSPECTIVE_QUOTA slots); it does not by itself enforce a
# per-second rate.
# NOTE(review): the account quota is presumably ~100 requests per second -- confirm
# that the chosen PERSPECTIVE_QUOTA keeps the request rate below it.
semaphore = asyncio.Semaphore(PERSPECTIVE_QUOTA)

# Asynchronous function to call the Perspective API for a given sentence
async def google_perspective_predict_async(session, sentence, max_retries=math.inf):
    """Return the IDENTITY_ATTACK summary score of `sentence`, or None.

    Args:
        session: aiohttp client session used to POST the request.
        sentence: text to score (sent as the comment text, language "en").
        max_retries: maximum number of attempts; the default retries forever
            on transient failures.

    Returns:
        The `summaryScore.value` of the IDENTITY_ATTACK attribute on HTTP 200,
        or None when the request is rejected with HTTP 400 (such a request is
        not retried) or when `max_retries` attempts were used up.
    """
    headers = {"Content-Type": "application/json"}
    payload = {
        'comment': {'text': sentence},
        'requestedAttributes': {'IDENTITY_ATTACK': {}},
        'languages': ["en"],
    }
    params = {'key': API_KEY}

    # Upper bound for the exponential back-off; without it 2 ** retries grows
    # without limit when max_retries is infinite.
    max_backoff_seconds = 64

    retries = 0
    while retries < max_retries:
        # The semaphore slot is held for the whole attempt including the
        # back-off sleep, so a sleeping task also keeps one slot occupied.
        async with semaphore:
            try:
                async with session.post(API_URL, headers=headers, params=params, json=payload) as response:
                    if response.status == 200:
                        result = await response.json()
                        return result['attributeScores']['IDENTITY_ATTACK']['summaryScore']['value']
                    elif response.status == 429:  # Quota exceeded error
                        # Exponential backoff, capped at max_backoff_seconds
                        await asyncio.sleep(min(2 ** min(retries, 6), max_backoff_seconds))
                    elif response.status == 400:
                        error_message = await response.json()
                        # Check if the error is due to text being too long
                        if "Comment text was too many bytes" in error_message.get('error', {}).get('message', ''):
                            print(f"Skipping sentence due to byte size limit: {len(sentence.encode('utf-8'))} bytes.")
                            return None  # Skip retrying if the text is too long
                        print(f"400 Error: {error_message} for sentence: {sentence}")
                        # A 400 means the request itself was rejected; sending the
                        # identical payload again cannot succeed, and retrying it
                        # without a delay would spin forever with max_retries=inf.
                        return None
                    else:
                        print(f"Error: {response.status}. Retrying...")
                        await asyncio.sleep(1)
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                # asyncio.TimeoutError is not an aiohttp.ClientError; uncaught it
                # would propagate out of the caller's gather().
                print(f"Request failed: {e}. Retrying...")
                await asyncio.sleep(1)

        retries += 1

    # If max retries are exceeded, return None
    print(f"Failed to process sentence after {max_retries} retries: {sentence}")
    return None

# Asynchronous function to process sentences and save results to a CSV file
async def process_file(input_file, output_file):
    """Score every sentence of a JSONL file and write the hits to a CSV file.

    Args:
        input_file: path of a JSONL file; each non-blank line is decoded with
            json.loads and passed to google_perspective_predict_async.
        output_file: path of the CSV to write (columns `text` and `score`).

    Only sentences whose score is not None and greater than 0.1 are written.
    The header row is written even when no sentence qualifies, so the file can
    always be parsed again with pandas.read_csv.
    """
    # Read input JSONL file; blank lines are not valid JSON and are skipped
    with open(input_file, 'r', encoding='utf-8') as infile:
        sentences = [json.loads(line) for line in infile if line.strip()]

    # Use a single session for all requests
    async with aiohttp.ClientSession() as session:
        scores = await asyncio.gather(
            *(google_perspective_predict_async(session, sentence) for sentence in sentences)
        )

    # Prepare data for CSV
    csv_data = [
        {"text": sentence, "score": score}
        for sentence, score in zip(sentences, scores)
        if score is not None and score > 0.1
    ]

    # Make sure the destination directory exists before writing
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the results to a CSV file; explicit columns keep the header for empty results
    df = pd.DataFrame(csv_data, columns=["text", "score"])
    df.to_csv(output_file, index=False)
    print(f"Saved results for {input_file} to {output_file}")

# Main function to process all JSONL files in a folder
async def main(folder_path):
    """Run process_file on every `*.jsonl` file directly inside `folder_path`.

    Results go to `<folder_path>/Perspective/<name>_results.csv`. The folder is
    created if it is missing. Files whose result CSV already exists are
    skipped, so an interrupted run can be resumed. Files are handled in
    sorted name order.

    Args:
        folder_path: directory containing the input JSONL files.
    """
    output_folder = os.path.join(folder_path, "Perspective")
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through all JSONL files in the specified folder
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".jsonl"):
            continue

        input_file = os.path.join(folder_path, filename)
        output_file = os.path.join(output_folder, filename.replace(".jsonl", "_results.csv"))

        # Check if the output file already exists
        if os.path.exists(output_file):
            print(f"File {output_file} already exists. Skipping this file.")
            continue

        # Process each file and save results to CSV
        await process_file(input_file, output_file)

# Run the main function using an event loop
# (nest_asyncio.apply() was called earlier in this cell so that an event loop can be used inside Jupyter)
if __name__ == "__main__":
    # Replace with your desired folder path
    # folder_path is assigned at notebook/module scope, so it remains visible to subsequent cells
    folder_path = TARGET_FOLDER
    asyncio.run(main(folder_path))


File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1750_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1751_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1752_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1753_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1754_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1755_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1756_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1758_results.csv already exists. Skipping this file.
File /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1759_results.csv already exists. Skipping this file.
File /mnt/

Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1910.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1910_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1911.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1911_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1912.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1912_results.csv
Error: 502. Retrying...
Error: 502. Retrying...
Error: 502. Retrying...
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1913.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1913_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1914.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1914_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1915.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1915_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1916.jsonl to /mnt/c/Use

Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1967.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1967_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1968.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1968_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1969.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1969_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1970.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1970_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1971.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1971_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1972.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1972_results.csv
Saved results for /mnt/c/Users/larsj/hate_sents_2/results_1973.jsonl to /mnt/c/Users/larsj/hate_sents_2/Perspective/results_1973_results.csv
Saved results

In [5]:
import json
import aiohttp
import asyncio
import math
import os
import pandas as pd  # Import pandas for CSV saving
import nest_asyncio  # Required for Jupyter Notebooks to allow nested asyncio loops
from aiohttp import ClientSession

# Accumulator for the sentences read from every JSONL file; reset each time this cell runs
all_sentences = []

# Synchronous helper: read the sentences stored in one JSONL file
def process_file(input_file, output_file=None):
    """Return the decoded JSON value of every non-blank line of `input_file`.

    NOTE: once this cell has run, this definition replaces the coroutine of
    the same name defined in an earlier cell.

    Args:
        input_file: path of the JSONL file to read (decoded as UTF-8).
        output_file: unused; accepted only so existing two-argument calls
            keep working.

    Returns:
        A list with one json.loads result per non-blank line, in file order.
    """
    with open(input_file, 'r', encoding='utf-8') as infile:
        # Blank lines are not valid JSON documents, so they are skipped
        return [json.loads(line) for line in infile if line.strip()]


# The yearly *.jsonl files are read from TARGET_FOLDER and the aggregate text
# file is written into the same folder. TARGET_FOLDER is used directly so this
# cell does not depend on a variable left behind by another cell.
output_folder = TARGET_FOLDER
output_file = os.path.join(output_folder, "roberta_hatefull_all.txt")
# Iterate through all JSONL files in the specified folder (sorted for a reproducible order)
for filename in sorted(os.listdir(output_folder)):
    if filename.endswith(".jsonl"):
        input_file = os.path.join(output_folder, filename)
        # Read the sentences of this file and add them to the aggregate list
        all_sentences.extend(process_file(input_file, output_file))

size = len(all_sentences)

# Write the collected sentences to the output text file, one per line
with open(output_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(sentence + "\n" for sentence in all_sentences)

print(f"Aggregated {size} sentences to {output_file}.")


3343433
Aggregated None sentences to /mnt/c/Users/larsj/hate_sents_2/roberta_hatefull_all.txt.


In [7]:
import os
import pandas as pd

# Define the folder containing the results CSV files
results_folder = f"{TARGET_FOLDER}/Perspective"

# Define a variable threshold that can be set to control filtering
score_threshold = 0.9

# Construct the output file name dynamically based on the threshold value
output_folder = os.path.join(results_folder, "Thresholds")
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists
output_txt_file = os.path.join(output_folder, f"aggregated_sentences_above_{str(score_threshold).replace('.', '_')}.txt")

# Initialize a list to store sentences with scores >= the defined threshold
sentences_above_threshold = []

# Loop through all files in the results folder. The listing is sorted so the
# order of the aggregated sentences is the same on every run.
for filename in sorted(os.listdir(results_folder)):
    if not filename.endswith("_results.csv"):  # Only process results CSV files
        continue
    file_path = os.path.join(results_folder, filename)

    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"Skipping {filename} as it contains no columns to parse.")
        continue

    # Check if the required columns exist in the CSV
    if 'text' not in df.columns or 'score' not in df.columns:
        print(f"Skipping {filename} as it doesn't contain the expected columns.")
        continue

    # Filter sentences with score >= the specified threshold. Missing texts are
    # dropped and the rest coerced to str, because read_csv can yield NaN or
    # numeric values that cannot be concatenated with "\n" below.
    high_score_sentences = df.loc[df['score'] >= score_threshold, 'text'].dropna().astype(str).tolist()
    sentences_above_threshold.extend(high_score_sentences)

# Write the collected sentences to the dynamically named output text file
with open(output_txt_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(sentence + "\n" for sentence in sentences_above_threshold)

print(f"Aggregated {len(sentences_above_threshold)} sentences with score >= {score_threshold} to {output_txt_file}.")


Skipping results_1758_results.csv as it contains no columns to parse.
Skipping results_1984_results.csv as it contains no columns to parse.
Skipping results_1991_results.csv as it contains no columns to parse.
Skipping results_1999_results.csv as it contains no columns to parse.
Aggregated 11 sentences with score >= 0.9 to /mnt/c/Users/larsj/hate_sents_2/Perspective/Thresholds/aggregated_sentences_above_0_9.txt.


In [11]:
import os
import pandas as pd
import json

def filter_sentences_by_threshold(input_folder, output_file, threshold):
    """Collect, per year, the texts scoring at least `threshold` into a JSONL file.

    Every `results_<year>_results.csv` file in `input_folder` is read in sorted
    name order. For each file with at least one row whose `score` is
    >= `threshold`, one line `{"year": <int>, "text": [<texts>]}` is written to
    `output_file`.

    Files are skipped with a message when they are empty, lack the `text` or
    `score` column, or have a non-numeric year part in their name.

    Args:
        input_folder: directory holding the per-year result CSV files.
        output_file: path of the JSON Lines file to write.
        threshold: minimum score (inclusive) a row needs to be kept.
    """
    # List to store the filtered JSON objects
    json_lines = []

    # Iterate through all files in the folder (sorted, so years come out in order)
    for filename in sorted(os.listdir(input_folder)):
        if not (filename.startswith("results_") and filename.endswith("_results.csv")):
            continue

        # Extract the year from the filename
        year = filename.split('_')[1]
        if not year.isdigit():
            print(f"Skipping {filename} as its name does not contain a numeric year.")
            continue

        file_path = os.path.join(input_folder, filename)
        try:
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            print(f"Skipping {year} as it contains no columns to parse.")
            continue

        if 'text' not in df.columns or 'score' not in df.columns:
            print(f"Skipping {year} as it doesn't contain the expected columns.")
            continue

        # Keep rows whose score reaches the threshold
        filtered_df = df[df['score'] >= threshold]

        # Only years with at least one matching row get a JSON object
        if not filtered_df.empty:
            json_lines.append({
                "year": int(year),
                "text": filtered_df['text'].tolist(),  # Convert text column to list
            })

    # Write the JSON objects to the output file in JSON Lines format
    with open(output_file, 'w') as outfile:
        for entry in json_lines:
            json.dump(entry, outfile)
            outfile.write('\n')

    print(f"Filtered results saved to {output_file}")

threshold = 0.5  # Set the threshold for filtering


input_folder = f"{TARGET_FOLDER}/Perspective"
# Construct the output file name dynamically based on the threshold value.
# The Datasets folder is derived from input_folder defined just above, so this
# cell does not depend on a variable created by a different cell.
output_folder = os.path.join(input_folder, "Datasets")
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists
output_file = os.path.join(output_folder, f"aggregated_sentences_above_{str(threshold).replace('.', '_')}.jsonl")

filter_sentences_by_threshold(input_folder, output_file, threshold)



Skipping 1758 as it contains no columns to parse.
Skipping 1984 as it contains no columns to parse.
Skipping 1991 as it contains no columns to parse.
Skipping 1999 as it contains no columns to parse.
Filtered results saved to /mnt/c/Users/larsj/hate_sents_2/Perspective/Datasets/aggregated_sentences_above_0_5.jsonl


In [11]:
# Create statistics about the number of hateful sentences for different Perspective API thresholds
import os
import pandas as pd
import json

def perspective_stats_by_threshold(input_folder, output_file):
    """Count, per year, how many rows reach each Perspective score threshold.

    Every `results_<year>_results.csv` file in `input_folder` is read in sorted
    name order. For each threshold from 0.1 to 0.9 the number of rows with
    `score` >= threshold is counted. A file that is completely empty
    contributes a row of zeros.

    Args:
        input_folder: directory holding the per-year result CSV files.
        output_file: path the statistics table is written to as CSV.

    Returns:
        A DataFrame with a `Year` column (the year part of the file name, as a
        string) followed by one `Threshold <t>` column per threshold.
    """
    # All threshold values
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    column_names = [f"Threshold {t}" for t in thresholds]  # Readable column labels

    # year -> list of counts, one per threshold; a repeated year overwrites the earlier entry
    results = {}

    # Iterate through all files in the folder
    for filename in sorted(os.listdir(input_folder)):
        if not (filename.startswith("results_") and filename.endswith("_results.csv")):
            continue

        # Extract the year from the filename
        year = filename.split('_')[1]
        file_path = os.path.join(input_folder, filename)

        try:
            # Read the CSV file into a DataFrame
            scores_df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            print(f"Assigning {year} 0 values as it contains no columns to parse.")
            results[year] = [0] * len(thresholds)
            continue

        results[year] = [int((scores_df['score'] >= t).sum()) for t in thresholds]

    # One row per year, 'Year' first, then the threshold columns. Explicit
    # column names keep this valid even when no file matched.
    df = pd.DataFrame(
        [[year, *counts] for year, counts in results.items()],
        columns=['Year', *column_names],
    )

    # Write the frame built here (not a notebook-level variable), then report it
    df.to_csv(output_file)
    print(f"saved statistics to {output_file}")

    return df

threshold = 0.5  # NOTE(review): not passed to anything in this cell -- looks like a leftover


# The result CSVs are read from, and the statistics file written to, the same folder
input_folder = f"{TARGET_FOLDER}/Perspective"
output_folder = input_folder
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists
output_file = os.path.join(output_folder, f"perspective_api_yearly_stats.csv")

stats_df = perspective_stats_by_threshold(input_folder, output_file)
# display is not imported in this file; presumably the IPython/Jupyter built-in
display(stats_df)
# Writes the returned table to output_file (the function above also writes to this path)
stats_df.to_csv(output_file)


Assigning 1758 0 values as it contains no columns to parse.
Assigning 1984 0 values as it contains no columns to parse.
Assigning 1991 0 values as it contains no columns to parse.
Assigning 1999 0 values as it contains no columns to parse.
saved statistics to /mnt/c/Users/larsj/hate_sents_2/Perspective/perspective_api_yearly_stats.csv


Unnamed: 0,Year,Threshold 0.1,Threshold 0.2,Threshold 0.3,Threshold 0.4,Threshold 0.5,Threshold 0.6,Threshold 0.7,Threshold 0.8,Threshold 0.9
0,1750,69,40,22,14,6,4,0,0,0
1,1751,145,80,44,16,7,2,0,0,0
2,1752,18,10,8,3,1,0,0,0,0
3,1753,225,122,62,31,15,6,1,0,0
4,1754,46,26,13,7,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
244,1995,25,18,7,4,2,0,0,0,0
245,1996,16,6,2,2,0,0,0,0,0
246,1997,4,2,0,0,0,0,0,0,0
247,1999,0,0,0,0,0,0,0,0,0


In [19]:
import pandas as pd
import random

# Set a seed for reproducibility
SEED = 42
random.seed(SEED)

threshold = 0.6

# Define file paths (the threshold is embedded with '.' replaced by '_')
threshold_tag = str(threshold).replace('.', '_')
input_file = f"{TARGET_FOLDER}/Perspective/Thresholds/aggregated_sentences_above_{threshold_tag}.txt"  # Input file containing sentences
output_file = f"{TARGET_FOLDER}/Perspective/Thresholds/{threshold_tag}_sample_chunk.csv"  # Output CSV file

# Read sentences from the input file and remove surrounding whitespace / newlines.
# The aggregation cells write these files as UTF-8, so read them the same way.
with open(input_file, 'r', encoding='utf-8') as file:
    sentences = [line.strip() for line in file]

# Split into up to 100 equal chunks. With fewer than 100 sentences every
# sentence becomes its own chunk, so no chunk is ever empty.
num_sentences = len(sentences)
num_chunks = min(100, num_sentences)
chunk_size = num_sentences // num_chunks if num_chunks else 0

# Sentences beyond num_chunks * chunk_size (the remainder) are not part of any chunk
chunks = [sentences[i * chunk_size: (i + 1) * chunk_size] for i in range(num_chunks)]

# Draw one random sample from each chunk
samples = [random.choice(chunk) for chunk in chunks]

# Create a DataFrame to store the samples
df = pd.DataFrame({
    "sentence": samples,
    "hateful": [''] * len(samples)  # Empty column for 'hateful', to be filled in by hand
})

# Save the DataFrame to a CSV file
df.to_csv(output_file, index=False)

print(f"Sampled sentences saved to {output_file}")


Sampled sentences saved to /mnt/c/Users/larsj/hate_sents_2/Perspective/Thresholds/0_6_sample_chunk.csv


In [14]:
import os
import pandas as pd

# Define the folder containing the results CSV files
results_folder = f"{TARGET_FOLDER}/Perspective"

# Lower (inclusive) and upper (exclusive) bound of the score band to extract.
# The sum is rounded because e.g. 0.7 + 0.1 evaluates to 0.7999999999999999,
# which would leave a gap between neighbouring bands.
score_threshold = 0.3
upper_threshold = round(score_threshold + 0.1, 10)

# Construct the output file name dynamically based on the threshold value
output_folder = os.path.join(results_folder, "Thresholds")
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists
output_txt_file = os.path.join(output_folder, f"aggregated_sentences_above_{str(score_threshold).replace('.', '_')}_delta.txt")

# Sentences whose score lies in [score_threshold, upper_threshold)
sentences_above_threshold = []

# Loop through all files in the results folder (sorted for a reproducible order)
for filename in sorted(os.listdir(results_folder)):
    if not filename.endswith("_results.csv"):  # Only process results CSV files
        continue
    file_path = os.path.join(results_folder, filename)

    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"Skipping {filename} as it contains no columns to parse.")
        continue

    # Check if the required columns exist in the CSV
    if 'text' not in df.columns or 'score' not in df.columns:
        print(f"Skipping {filename} as it doesn't contain the expected columns.")
        continue

    # Keep rows inside the band. Missing texts are dropped and the rest coerced
    # to str, because read_csv can yield NaN or numeric values that cannot be
    # concatenated with "\n" below.
    in_band = (df['score'] >= score_threshold) & (df['score'] < upper_threshold)
    high_score_sentences = df.loc[in_band, 'text'].dropna().astype(str).tolist()
    sentences_above_threshold.extend(high_score_sentences)

# Write the collected sentences to the dynamically named output text file
with open(output_txt_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(sentence + "\n" for sentence in sentences_above_threshold)

print(f"Aggregated {len(sentences_above_threshold)} sentences with score >= {score_threshold} and score < {upper_threshold} to {output_txt_file}.")



Skipping results_1758_results.csv as it contains no columns to parse.
Skipping results_1984_results.csv as it contains no columns to parse.
Skipping results_1991_results.csv as it contains no columns to parse.
Skipping results_1999_results.csv as it contains no columns to parse.
Aggregated 148038 sentences with score >= 0.3 to /mnt/c/Users/larsj/hate_sents_2/Perspective/Thresholds/aggregated_sentences_above_0_3_delta.txt.
